diff options
Diffstat (limited to 'src/mongo/db/exec')
83 files changed, 13294 insertions, 13587 deletions
diff --git a/src/mongo/db/exec/and_common-inl.h b/src/mongo/db/exec/and_common-inl.h index 53e172d153d..c4be4684078 100644 --- a/src/mongo/db/exec/and_common-inl.h +++ b/src/mongo/db/exec/and_common-inl.h @@ -30,68 +30,69 @@ namespace mongo { - class AndCommon { - public: - /** - * If src has any data dest doesn't, add that data to dest. - */ - static void mergeFrom(WorkingSetMember* dest, const WorkingSetMember& src) { - // Both 'src' and 'dest' must have a RecordId (and they must be the same RecordId), as - // we should have just matched them according to this RecordId while doing an - // intersection. - verify(dest->hasLoc()); - verify(src.hasLoc()); - verify(dest->loc == src.loc); +class AndCommon { +public: + /** + * If src has any data dest doesn't, add that data to dest. + */ + static void mergeFrom(WorkingSetMember* dest, const WorkingSetMember& src) { + // Both 'src' and 'dest' must have a RecordId (and they must be the same RecordId), as + // we should have just matched them according to this RecordId while doing an + // intersection. + verify(dest->hasLoc()); + verify(src.hasLoc()); + verify(dest->loc == src.loc); - // Merge computed data. - typedef WorkingSetComputedDataType WSCD; - for (WSCD i = WSCD(0); i < WSM_COMPUTED_NUM_TYPES; i = WSCD(i + 1)) { - if (!dest->hasComputed(i) && src.hasComputed(i)) { - dest->addComputed(src.getComputed(i)->clone()); - } + // Merge computed data. + typedef WorkingSetComputedDataType WSCD; + for (WSCD i = WSCD(0); i < WSM_COMPUTED_NUM_TYPES; i = WSCD(i + 1)) { + if (!dest->hasComputed(i) && src.hasComputed(i)) { + dest->addComputed(src.getComputed(i)->clone()); } + } - if (dest->hasObj()) { - // The merged WSM that we're creating already has the full document, so there's - // nothing left to do. - return; - } + if (dest->hasObj()) { + // The merged WSM that we're creating already has the full document, so there's + // nothing left to do. 
+ return; + } - if (src.hasObj()) { - // 'src' has the full document but 'dest' doesn't so we need to copy it over. - dest->obj = src.obj; + if (src.hasObj()) { + // 'src' has the full document but 'dest' doesn't so we need to copy it over. + dest->obj = src.obj; - // We have an object so we don't need key data. - dest->keyData.clear(); + // We have an object so we don't need key data. + dest->keyData.clear(); - // 'dest' should have the same state as 'src'. If 'src' has an unowned obj, then - // 'dest' also should have an unowned obj; if 'src' has an owned obj, then dest - // should also have an owned obj. - dest->state = src.state; + // 'dest' should have the same state as 'src'. If 'src' has an unowned obj, then + // 'dest' also should have an unowned obj; if 'src' has an owned obj, then dest + // should also have an owned obj. + dest->state = src.state; - // Now 'dest' has the full object. No more work to do. - return; - } + // Now 'dest' has the full object. No more work to do. + return; + } - // If we're here, then both WSMs getting merged contain index keys. We need - // to merge the key data. - // - // This is N^2 but N is probably pretty small. Easy enough to revisit. - for (size_t i = 0; i < src.keyData.size(); ++i) { - bool found = false; - for (size_t j = 0; j < dest->keyData.size(); ++j) { - if (dest->keyData[j].indexKeyPattern == src.keyData[i].indexKeyPattern) { - found = true; - break; - } + // If we're here, then both WSMs getting merged contain index keys. We need + // to merge the key data. + // + // This is N^2 but N is probably pretty small. Easy enough to revisit. 
+ for (size_t i = 0; i < src.keyData.size(); ++i) { + bool found = false; + for (size_t j = 0; j < dest->keyData.size(); ++j) { + if (dest->keyData[j].indexKeyPattern == src.keyData[i].indexKeyPattern) { + found = true; + break; } - if (!found) { dest->keyData.push_back(src.keyData[i]); } } - - if (src.isSuspicious) - dest->isSuspicious = true; + if (!found) { + dest->keyData.push_back(src.keyData[i]); + } } - }; -} // namespace mongo + if (src.isSuspicious) + dest->isSuspicious = true; + } +}; +} // namespace mongo diff --git a/src/mongo/db/exec/and_hash.cpp b/src/mongo/db/exec/and_hash.cpp index 21d7322dd67..71084b40a31 100644 --- a/src/mongo/db/exec/and_hash.cpp +++ b/src/mongo/db/exec/and_hash.cpp @@ -36,498 +36,493 @@ namespace { - // Upper limit for buffered data. - // Stage execution will fail once size of all buffered data exceeds this threshold. - const size_t kDefaultMaxMemUsageBytes = 32 * 1024 * 1024; +// Upper limit for buffered data. +// Stage execution will fail once size of all buffered data exceeds this threshold. 
+const size_t kDefaultMaxMemUsageBytes = 32 * 1024 * 1024; -} // namespace +} // namespace namespace mongo { - using std::unique_ptr; - using std::vector; - - const size_t AndHashStage::kLookAheadWorks = 10; - - // static - const char* AndHashStage::kStageType = "AND_HASH"; - - AndHashStage::AndHashStage(WorkingSet* ws, const Collection* collection) - : _collection(collection), - _ws(ws), - _hashingChildren(true), - _currentChild(0), - _commonStats(kStageType), - _memUsage(0), - _maxMemUsage(kDefaultMaxMemUsageBytes) {} - - AndHashStage::AndHashStage(WorkingSet* ws, - const Collection* collection, - size_t maxMemUsage) - : _collection(collection), - _ws(ws), - _hashingChildren(true), - _currentChild(0), - _commonStats(kStageType), - _memUsage(0), - _maxMemUsage(maxMemUsage) {} - - AndHashStage::~AndHashStage() { - for (size_t i = 0; i < _children.size(); ++i) { delete _children[i]; } +using std::unique_ptr; +using std::vector; + +const size_t AndHashStage::kLookAheadWorks = 10; + +// static +const char* AndHashStage::kStageType = "AND_HASH"; + +AndHashStage::AndHashStage(WorkingSet* ws, const Collection* collection) + : _collection(collection), + _ws(ws), + _hashingChildren(true), + _currentChild(0), + _commonStats(kStageType), + _memUsage(0), + _maxMemUsage(kDefaultMaxMemUsageBytes) {} + +AndHashStage::AndHashStage(WorkingSet* ws, const Collection* collection, size_t maxMemUsage) + : _collection(collection), + _ws(ws), + _hashingChildren(true), + _currentChild(0), + _commonStats(kStageType), + _memUsage(0), + _maxMemUsage(maxMemUsage) {} + +AndHashStage::~AndHashStage() { + for (size_t i = 0; i < _children.size(); ++i) { + delete _children[i]; } +} - void AndHashStage::addChild(PlanStage* child) { _children.push_back(child); } +void AndHashStage::addChild(PlanStage* child) { + _children.push_back(child); +} - size_t AndHashStage::getMemUsage() const { - return _memUsage; - } - - bool AndHashStage::isEOF() { - // This is empty before calling work() and not-empty 
after. - if (_lookAheadResults.empty()) { return false; } +size_t AndHashStage::getMemUsage() const { + return _memUsage; +} - // Either we're busy hashing children, in which case we're not done yet. - if (_hashingChildren) { return false; } +bool AndHashStage::isEOF() { + // This is empty before calling work() and not-empty after. + if (_lookAheadResults.empty()) { + return false; + } - // Or we're streaming in results from the last child. + // Either we're busy hashing children, in which case we're not done yet. + if (_hashingChildren) { + return false; + } - // If there's nothing to probe against, we're EOF. - if (_dataMap.empty()) { return true; } + // Or we're streaming in results from the last child. - // Otherwise, we're done when the last child is done. - invariant(_children.size() >= 2); - return (WorkingSet::INVALID_ID == _lookAheadResults[_children.size() - 1]) - && _children[_children.size() - 1]->isEOF(); + // If there's nothing to probe against, we're EOF. + if (_dataMap.empty()) { + return true; } - PlanStage::StageState AndHashStage::work(WorkingSetID* out) { - ++_commonStats.works; + // Otherwise, we're done when the last child is done. + invariant(_children.size() >= 2); + return (WorkingSet::INVALID_ID == _lookAheadResults[_children.size() - 1]) && + _children[_children.size() - 1]->isEOF(); +} - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); +PlanStage::StageState AndHashStage::work(WorkingSetID* out) { + ++_commonStats.works; - if (isEOF()) { return PlanStage::IS_EOF; } + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - // Fast-path for one of our children being EOF immediately. We work each child a few times. - // If it hits EOF, the AND cannot output anything. If it produces a result, we stash that - // result in _lookAheadResults. 
- if (_lookAheadResults.empty()) { - // INVALID_ID means that the child didn't produce a valid result. - - // We specifically are not using .resize(size, value) here because C++11 builds don't - // seem to resolve WorkingSet::INVALID_ID during linking. - _lookAheadResults.resize(_children.size()); - for (size_t i = 0; i < _children.size(); ++i) { - _lookAheadResults[i] = WorkingSet::INVALID_ID; - } + if (isEOF()) { + return PlanStage::IS_EOF; + } - // Work each child some number of times until it's either EOF or produces - // a result. If it's EOF this whole stage will be EOF. If it produces a - // result we cache it for later. - for (size_t i = 0; i < _children.size(); ++i) { - PlanStage* child = _children[i]; - for (size_t j = 0; j < kLookAheadWorks; ++j) { - StageState childStatus = child->work(&_lookAheadResults[i]); + // Fast-path for one of our children being EOF immediately. We work each child a few times. + // If it hits EOF, the AND cannot output anything. If it produces a result, we stash that + // result in _lookAheadResults. + if (_lookAheadResults.empty()) { + // INVALID_ID means that the child didn't produce a valid result. - if (PlanStage::IS_EOF == childStatus) { + // We specifically are not using .resize(size, value) here because C++11 builds don't + // seem to resolve WorkingSet::INVALID_ID during linking. + _lookAheadResults.resize(_children.size()); + for (size_t i = 0; i < _children.size(); ++i) { + _lookAheadResults[i] = WorkingSet::INVALID_ID; + } - // A child went right to EOF. Bail out. - _hashingChildren = false; - _dataMap.clear(); - return PlanStage::IS_EOF; - } - else if (PlanStage::ADVANCED == childStatus) { - // We have a result cached in _lookAheadResults[i]. Stop looking at this - // child. - break; - } - else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) { - // Propage error to parent. 
- *out = _lookAheadResults[i]; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. - if (WorkingSet::INVALID_ID == *out) { - mongoutils::str::stream ss; - ss << "hashed AND stage failed to read in look ahead results " - << "from child " << i - << ", childStatus: " << PlanStage::stateStr(childStatus); - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } - - _hashingChildren = false; - _dataMap.clear(); - return childStatus; + // Work each child some number of times until it's either EOF or produces + // a result. If it's EOF this whole stage will be EOF. If it produces a + // result we cache it for later. + for (size_t i = 0; i < _children.size(); ++i) { + PlanStage* child = _children[i]; + for (size_t j = 0; j < kLookAheadWorks; ++j) { + StageState childStatus = child->work(&_lookAheadResults[i]); + + if (PlanStage::IS_EOF == childStatus) { + // A child went right to EOF. Bail out. + _hashingChildren = false; + _dataMap.clear(); + return PlanStage::IS_EOF; + } else if (PlanStage::ADVANCED == childStatus) { + // We have a result cached in _lookAheadResults[i]. Stop looking at this + // child. + break; + } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) { + // Propage error to parent. + *out = _lookAheadResults[i]; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. + if (WorkingSet::INVALID_ID == *out) { + mongoutils::str::stream ss; + ss << "hashed AND stage failed to read in look ahead results " + << "from child " << i + << ", childStatus: " << PlanStage::stateStr(childStatus); + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); } - // We ignore NEED_TIME. 
TODO: what do we want to do if we get NEED_YIELD here? + + _hashingChildren = false; + _dataMap.clear(); + return childStatus; } + // We ignore NEED_TIME. TODO: what do we want to do if we get NEED_YIELD here? } - - // We did a bunch of work above, return NEED_TIME to be fair. - return PlanStage::NEED_TIME; } - // An AND is either reading the first child into the hash table, probing against the hash - // table with subsequent children, or checking the last child's results to see if they're - // in the hash table. + // We did a bunch of work above, return NEED_TIME to be fair. + return PlanStage::NEED_TIME; + } - // We read the first child into our hash table. - if (_hashingChildren) { - // Check memory usage of previously hashed results. - if (_memUsage > _maxMemUsage) { - mongoutils::str::stream ss; - ss << "hashed AND stage buffered data usage of " << _memUsage - << " bytes exceeds internal limit of " << kDefaultMaxMemUsageBytes << " bytes"; - Status status(ErrorCodes::Overflow, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - return PlanStage::FAILURE; - } + // An AND is either reading the first child into the hash table, probing against the hash + // table with subsequent children, or checking the last child's results to see if they're + // in the hash table. + + // We read the first child into our hash table. + if (_hashingChildren) { + // Check memory usage of previously hashed results. 
+ if (_memUsage > _maxMemUsage) { + mongoutils::str::stream ss; + ss << "hashed AND stage buffered data usage of " << _memUsage + << " bytes exceeds internal limit of " << kDefaultMaxMemUsageBytes << " bytes"; + Status status(ErrorCodes::Overflow, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + return PlanStage::FAILURE; + } - if (0 == _currentChild) { - return readFirstChild(out); - } - else if (_currentChild < _children.size() - 1) { - return hashOtherChildren(out); - } - else { - _hashingChildren = false; - // We don't hash our last child. Instead, we probe the table created from the - // previous children, returning results in the order of the last child. - // Fall through to below. - } + if (0 == _currentChild) { + return readFirstChild(out); + } else if (_currentChild < _children.size() - 1) { + return hashOtherChildren(out); + } else { + _hashingChildren = false; + // We don't hash our last child. Instead, we probe the table created from the + // previous children, returning results in the order of the last child. + // Fall through to below. } + } - // Returning results. We read from the last child and return the results that are in our - // hash map. + // Returning results. We read from the last child and return the results that are in our + // hash map. - // We should be EOF if we're not hashing results and the dataMap is empty. - verify(!_dataMap.empty()); + // We should be EOF if we're not hashing results and the dataMap is empty. + verify(!_dataMap.empty()); - // We probe _dataMap with the last child. - verify(_currentChild == _children.size() - 1); + // We probe _dataMap with the last child. + verify(_currentChild == _children.size() - 1); - // Get the next result for the (_children.size() - 1)-th child. - StageState childStatus = workChild(_children.size() - 1, out); - if (PlanStage::ADVANCED != childStatus) { - return childStatus; - } + // Get the next result for the (_children.size() - 1)-th child. 
+ StageState childStatus = workChild(_children.size() - 1, out); + if (PlanStage::ADVANCED != childStatus) { + return childStatus; + } + + // We know that we've ADVANCED. See if the WSM is in our table. + WorkingSetMember* member = _ws->get(*out); + + // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything + // with this WSM. + if (!member->hasLoc()) { + _ws->flagForReview(*out); + return PlanStage::NEED_TIME; + } - // We know that we've ADVANCED. See if the WSM is in our table. - WorkingSetMember* member = _ws->get(*out); + DataMap::iterator it = _dataMap.find(member->loc); + if (_dataMap.end() == it) { + // Child's output wasn't in every previous child. Throw it out. + _ws->free(*out); + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else { + // Child's output was in every previous child. Merge any key data in + // the child's output and free the child's just-outputted WSM. + WorkingSetID hashID = it->second; + _dataMap.erase(it); + + WorkingSetMember* olderMember = _ws->get(hashID); + AndCommon::mergeFrom(olderMember, *member); + _ws->free(*out); + + ++_commonStats.advanced; + *out = hashID; + return PlanStage::ADVANCED; + } +} + +PlanStage::StageState AndHashStage::workChild(size_t childNo, WorkingSetID* out) { + if (WorkingSet::INVALID_ID != _lookAheadResults[childNo]) { + *out = _lookAheadResults[childNo]; + _lookAheadResults[childNo] = WorkingSet::INVALID_ID; + return PlanStage::ADVANCED; + } else { + return _children[childNo]->work(out); + } +} + +PlanStage::StageState AndHashStage::readFirstChild(WorkingSetID* out) { + verify(_currentChild == 0); + + WorkingSetID id = WorkingSet::INVALID_ID; + StageState childStatus = workChild(0, &id); + + if (PlanStage::ADVANCED == childStatus) { + WorkingSetMember* member = _ws->get(id); // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything // with this WSM. 
if (!member->hasLoc()) { - _ws->flagForReview(*out); + _ws->flagForReview(id); return PlanStage::NEED_TIME; } - DataMap::iterator it = _dataMap.find(member->loc); - if (_dataMap.end() == it) { - // Child's output wasn't in every previous child. Throw it out. - _ws->free(*out); + if (!_dataMap.insert(std::make_pair(member->loc, id)).second) { + // Didn't insert because we already had this loc inside the map. This should only + // happen if we're seeing a newer copy of the same doc in a more recent snapshot. + // Throw out the newer copy of the doc. + _ws->free(id); ++_commonStats.needTime; return PlanStage::NEED_TIME; } - else { - // Child's output was in every previous child. Merge any key data in - // the child's output and free the child's just-outputted WSM. - WorkingSetID hashID = it->second; - _dataMap.erase(it); - WorkingSetMember* olderMember = _ws->get(hashID); - AndCommon::mergeFrom(olderMember, *member); - _ws->free(*out); + // Update memory stats. + _memUsage += member->getMemUsage(); - ++_commonStats.advanced; - *out = hashID; - return PlanStage::ADVANCED; - } - } + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::IS_EOF == childStatus) { + // Done reading child 0. + _currentChild = 1; - PlanStage::StageState AndHashStage::workChild(size_t childNo, WorkingSetID* out) { - if (WorkingSet::INVALID_ID != _lookAheadResults[childNo]) { - *out = _lookAheadResults[childNo]; - _lookAheadResults[childNo] = WorkingSet::INVALID_ID; - return PlanStage::ADVANCED; - } - else { - return _children[childNo]->work(out); + // If our first child was empty, don't scan any others, no possible results. 
+ if (_dataMap.empty()) { + _hashingChildren = false; + return PlanStage::IS_EOF; } - } - - PlanStage::StageState AndHashStage::readFirstChild(WorkingSetID* out) { - verify(_currentChild == 0); - - WorkingSetID id = WorkingSet::INVALID_ID; - StageState childStatus = workChild(0, &id); - if (PlanStage::ADVANCED == childStatus) { - WorkingSetMember* member = _ws->get(id); - - // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything - // with this WSM. - if (!member->hasLoc()) { - _ws->flagForReview(id); - return PlanStage::NEED_TIME; - } - - if (!_dataMap.insert(std::make_pair(member->loc, id)).second) { - // Didn't insert because we already had this loc inside the map. This should only - // happen if we're seeing a newer copy of the same doc in a more recent snapshot. - // Throw out the newer copy of the doc. - _ws->free(id); - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - - // Update memory stats. - _memUsage += member->getMemUsage(); - - ++_commonStats.needTime; - return PlanStage::NEED_TIME; + ++_commonStats.needTime; + _specificStats.mapAfterChild.push_back(_dataMap.size()); + + return PlanStage::NEED_TIME; + } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. + if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "hashed AND stage failed to read in results to from first child"; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); } - else if (PlanStage::IS_EOF == childStatus) { - // Done reading child 0. - _currentChild = 1; - - // If our first child was empty, don't scan any others, no possible results. 
- if (_dataMap.empty()) { - _hashingChildren = false; - return PlanStage::IS_EOF; - } - + return childStatus; + } else { + if (PlanStage::NEED_TIME == childStatus) { ++_commonStats.needTime; - _specificStats.mapAfterChild.push_back(_dataMap.size()); - - return PlanStage::NEED_TIME; - } - else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) { + } else if (PlanStage::NEED_YIELD == childStatus) { + ++_commonStats.needYield; *out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. - if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "hashed AND stage failed to read in results to from first child"; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } - return childStatus; } - else { - if (PlanStage::NEED_TIME == childStatus) { - ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == childStatus) { - ++_commonStats.needYield; - *out = id; - } - return childStatus; - } + return childStatus; } +} - PlanStage::StageState AndHashStage::hashOtherChildren(WorkingSetID* out) { - verify(_currentChild > 0); +PlanStage::StageState AndHashStage::hashOtherChildren(WorkingSetID* out) { + verify(_currentChild > 0); - WorkingSetID id = WorkingSet::INVALID_ID; - StageState childStatus = workChild(_currentChild, &id); + WorkingSetID id = WorkingSet::INVALID_ID; + StageState childStatus = workChild(_currentChild, &id); - if (PlanStage::ADVANCED == childStatus) { - WorkingSetMember* member = _ws->get(id); + if (PlanStage::ADVANCED == childStatus) { + WorkingSetMember* member = _ws->get(id); - // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything - // with this WSM. - if (!member->hasLoc()) { - _ws->flagForReview(id); - return PlanStage::NEED_TIME; - } + // Maybe the child had an invalidation. 
We intersect RecordId(s) so we can't do anything + // with this WSM. + if (!member->hasLoc()) { + _ws->flagForReview(id); + return PlanStage::NEED_TIME; + } - verify(member->hasLoc()); - if (_dataMap.end() == _dataMap.find(member->loc)) { - // Ignore. It's not in any previous child. - } - else { - // We have a hit. Copy data into the WSM we already have. - _seenMap.insert(member->loc); - WorkingSetMember* olderMember = _ws->get(_dataMap[member->loc]); - size_t memUsageBefore = olderMember->getMemUsage(); + verify(member->hasLoc()); + if (_dataMap.end() == _dataMap.find(member->loc)) { + // Ignore. It's not in any previous child. + } else { + // We have a hit. Copy data into the WSM we already have. + _seenMap.insert(member->loc); + WorkingSetMember* olderMember = _ws->get(_dataMap[member->loc]); + size_t memUsageBefore = olderMember->getMemUsage(); + + AndCommon::mergeFrom(olderMember, *member); - AndCommon::mergeFrom(olderMember, *member); + // Update memory stats. + _memUsage += olderMember->getMemUsage() - memUsageBefore; + } + _ws->free(id); + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::IS_EOF == childStatus) { + // Finished with a child. + ++_currentChild; + + // Keep elements of _dataMap that are in _seenMap. + DataMap::iterator it = _dataMap.begin(); + while (it != _dataMap.end()) { + if (_seenMap.end() == _seenMap.find(it->first)) { + DataMap::iterator toErase = it; + ++it; // Update memory stats. - _memUsage += olderMember->getMemUsage() - memUsageBefore; + WorkingSetMember* member = _ws->get(toErase->second); + _memUsage -= member->getMemUsage(); + + _ws->free(toErase->second); + _dataMap.erase(toErase); + } else { + ++it; } - _ws->free(id); - ++_commonStats.needTime; - return PlanStage::NEED_TIME; } - else if (PlanStage::IS_EOF == childStatus) { - // Finished with a child. - ++_currentChild; - - // Keep elements of _dataMap that are in _seenMap. 
- DataMap::iterator it = _dataMap.begin(); - while (it != _dataMap.end()) { - if (_seenMap.end() == _seenMap.find(it->first)) { - DataMap::iterator toErase = it; - ++it; - - // Update memory stats. - WorkingSetMember* member = _ws->get(toErase->second); - _memUsage -= member->getMemUsage(); - - _ws->free(toErase->second); - _dataMap.erase(toErase); - } - else { ++it; } - } - _specificStats.mapAfterChild.push_back(_dataMap.size()); + _specificStats.mapAfterChild.push_back(_dataMap.size()); - _seenMap.clear(); + _seenMap.clear(); - // _dataMap is now the intersection of the first _currentChild nodes. + // _dataMap is now the intersection of the first _currentChild nodes. - // If we have nothing to AND with after finishing any child, stop. - if (_dataMap.empty()) { - _hashingChildren = false; - return PlanStage::IS_EOF; - } + // If we have nothing to AND with after finishing any child, stop. + if (_dataMap.empty()) { + _hashingChildren = false; + return PlanStage::IS_EOF; + } - // We've finished scanning all children. Return results with the next call to work(). - if (_currentChild == _children.size()) { - _hashingChildren = false; - } + // We've finished scanning all children. Return results with the next call to work(). + if (_currentChild == _children.size()) { + _hashingChildren = false; + } - ++_commonStats.needTime; - return PlanStage::NEED_TIME; + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. 
+ if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "hashed AND stage failed to read in results from other child " << _currentChild; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); } - else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) { + return childStatus; + } else { + if (PlanStage::NEED_TIME == childStatus) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == childStatus) { + ++_commonStats.needYield; *out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. - if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "hashed AND stage failed to read in results from other child " - << _currentChild; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } - return childStatus; } - else { - if (PlanStage::NEED_TIME == childStatus) { - ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == childStatus) { - ++_commonStats.needYield; - *out = id; - } - return childStatus; - } + return childStatus; } +} - void AndHashStage::saveState() { - ++_commonStats.yields; +void AndHashStage::saveState() { + ++_commonStats.yields; - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->saveState(); - } + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->saveState(); } +} - void AndHashStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; +void AndHashStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->restoreState(opCtx); - } + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->restoreState(opCtx); } +} - void AndHashStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - 
++_commonStats.invalidates; +void AndHashStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; - if (isEOF()) { return; } + if (isEOF()) { + return; + } - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->invalidate(txn, dl, type); - } + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->invalidate(txn, dl, type); + } - // Invalidation can happen to our warmup results. If that occurs just - // flag it and forget about it. - for (size_t i = 0; i < _lookAheadResults.size(); ++i) { - if (WorkingSet::INVALID_ID != _lookAheadResults[i]) { - WorkingSetMember* member = _ws->get(_lookAheadResults[i]); - if (member->hasLoc() && member->loc == dl) { - WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); - _ws->flagForReview(_lookAheadResults[i]); - _lookAheadResults[i] = WorkingSet::INVALID_ID; - } + // Invalidation can happen to our warmup results. If that occurs just + // flag it and forget about it. + for (size_t i = 0; i < _lookAheadResults.size(); ++i) { + if (WorkingSet::INVALID_ID != _lookAheadResults[i]) { + WorkingSetMember* member = _ws->get(_lookAheadResults[i]); + if (member->hasLoc() && member->loc == dl) { + WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); + _ws->flagForReview(_lookAheadResults[i]); + _lookAheadResults[i] = WorkingSet::INVALID_ID; } } + } - // If it's a deletion, we have to forget about the RecordId, and since the AND-ing is by - // RecordId we can't continue processing it even with the object. - // - // If it's a mutation the predicates implied by the AND-ing may no longer be true. - // - // So, we flag and try to pick it up later. 
- DataMap::iterator it = _dataMap.find(dl); - if (_dataMap.end() != it) { - WorkingSetID id = it->second; - WorkingSetMember* member = _ws->get(id); - verify(member->loc == dl); - - if (_hashingChildren) { - ++_specificStats.flaggedInProgress; - } - else { - ++_specificStats.flaggedButPassed; - } + // If it's a deletion, we have to forget about the RecordId, and since the AND-ing is by + // RecordId we can't continue processing it even with the object. + // + // If it's a mutation the predicates implied by the AND-ing may no longer be true. + // + // So, we flag and try to pick it up later. + DataMap::iterator it = _dataMap.find(dl); + if (_dataMap.end() != it) { + WorkingSetID id = it->second; + WorkingSetMember* member = _ws->get(id); + verify(member->loc == dl); - // Update memory stats. - _memUsage -= member->getMemUsage(); + if (_hashingChildren) { + ++_specificStats.flaggedInProgress; + } else { + ++_specificStats.flaggedButPassed; + } - // The loc is about to be invalidated. Fetch it and clear the loc. - WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); + // Update memory stats. + _memUsage -= member->getMemUsage(); - // Add the WSID to the to-be-reviewed list in the WS. - _ws->flagForReview(id); + // The loc is about to be invalidated. Fetch it and clear the loc. + WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); - // And don't return it from this stage. - _dataMap.erase(it); - } - } + // Add the WSID to the to-be-reviewed list in the WS. + _ws->flagForReview(id); - vector<PlanStage*> AndHashStage::getChildren() const { - return _children; + // And don't return it from this stage. 
+ _dataMap.erase(it); } +} - PlanStageStats* AndHashStage::getStats() { - _commonStats.isEOF = isEOF(); +vector<PlanStage*> AndHashStage::getChildren() const { + return _children; +} - _specificStats.memLimit = _maxMemUsage; - _specificStats.memUsage = _memUsage; +PlanStageStats* AndHashStage::getStats() { + _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_AND_HASH)); - ret->specific.reset(new AndHashStats(_specificStats)); - for (size_t i = 0; i < _children.size(); ++i) { - ret->children.push_back(_children[i]->getStats()); - } + _specificStats.memLimit = _maxMemUsage; + _specificStats.memUsage = _memUsage; - return ret.release(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_AND_HASH)); + ret->specific.reset(new AndHashStats(_specificStats)); + for (size_t i = 0; i < _children.size(); ++i) { + ret->children.push_back(_children[i]->getStats()); } - const CommonStats* AndHashStage::getCommonStats() const { - return &_commonStats; - } + return ret.release(); +} - const SpecificStats* AndHashStage::getSpecificStats() const { - return &_specificStats; - } +const CommonStats* AndHashStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* AndHashStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/and_hash.h b/src/mongo/db/exec/and_hash.h index 16e7ec3759f..efe625619db 100644 --- a/src/mongo/db/exec/and_hash.h +++ b/src/mongo/db/exec/and_hash.h @@ -38,106 +38,106 @@ namespace mongo { +/** + * Reads from N children, each of which must have a valid RecordId. Uses a hash table to + * intersect the outputs of the N children, and outputs the intersection. + * + * Preconditions: Valid RecordId. More than one child. + * + * Any RecordId that we keep a reference to that is invalidated before we are able to return it + * is fetched and added to the WorkingSet as "flagged for further review." 
Because this stage + * operates with RecordIds, we are unable to evaluate the AND for the invalidated RecordId, and it + * must be fully matched later. + */ +class AndHashStage : public PlanStage { +public: + AndHashStage(WorkingSet* ws, const Collection* collection); + /** - * Reads from N children, each of which must have a valid RecordId. Uses a hash table to - * intersect the outputs of the N children, and outputs the intersection. - * - * Preconditions: Valid RecordId. More than one child. - * - * Any RecordId that we keep a reference to that is invalidated before we are able to return it - * is fetched and added to the WorkingSet as "flagged for further review." Because this stage - * operates with RecordIds, we are unable to evaluate the AND for the invalidated RecordId, and it - * must be fully matched later. + * For testing only. Allows tests to set memory usage threshold. */ - class AndHashStage : public PlanStage { - public: - AndHashStage(WorkingSet* ws, const Collection* collection); - - /** - * For testing only. Allows tests to set memory usage threshold. - */ - AndHashStage(WorkingSet* ws, - const Collection* collection, - size_t maxMemUsage); + AndHashStage(WorkingSet* ws, const Collection* collection, size_t maxMemUsage); - virtual ~AndHashStage(); + virtual ~AndHashStage(); - void addChild(PlanStage* child); + void addChild(PlanStage* child); - /** - * Returns memory usage. - * For testing only. - */ - size_t getMemUsage() const; + /** + * Returns memory usage. + * For testing only. 
+ */ + size_t getMemUsage() const; - virtual StageState work(WorkingSetID* out); - virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_AND_HASH; } + virtual StageType stageType() const { + return STAGE_AND_HASH; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - static const size_t kLookAheadWorks; +private: + static const size_t kLookAheadWorks; - StageState readFirstChild(WorkingSetID* out); - StageState hashOtherChildren(WorkingSetID* out); - StageState workChild(size_t childNo, WorkingSetID* out); + StageState readFirstChild(WorkingSetID* out); + StageState hashOtherChildren(WorkingSetID* out); + StageState workChild(size_t childNo, WorkingSetID* out); - // Not owned by us. - const Collection* _collection; + // Not owned by us. + const Collection* _collection; - // Not owned by us. - WorkingSet* _ws; + // Not owned by us. + WorkingSet* _ws; - // The stages we read from. Owned by us. - std::vector<PlanStage*> _children; + // The stages we read from. Owned by us. + std::vector<PlanStage*> _children; - // We want to see if any of our children are EOF immediately. 
This requires working them a - // few times to see if they hit EOF or if they produce a result. If they produce a result, - // we place that result here. - std::vector<WorkingSetID> _lookAheadResults; + // We want to see if any of our children are EOF immediately. This requires working them a + // few times to see if they hit EOF or if they produce a result. If they produce a result, + // we place that result here. + std::vector<WorkingSetID> _lookAheadResults; - // _dataMap is filled out by the first child and probed by subsequent children. This is the - // hash table that we create by intersecting _children and probe with the last child. - typedef unordered_map<RecordId, WorkingSetID, RecordId::Hasher> DataMap; - DataMap _dataMap; + // _dataMap is filled out by the first child and probed by subsequent children. This is the + // hash table that we create by intersecting _children and probe with the last child. + typedef unordered_map<RecordId, WorkingSetID, RecordId::Hasher> DataMap; + DataMap _dataMap; - // Keeps track of what elements from _dataMap subsequent children have seen. - // Only used while _hashingChildren. - typedef unordered_set<RecordId, RecordId::Hasher> SeenMap; - SeenMap _seenMap; + // Keeps track of what elements from _dataMap subsequent children have seen. + // Only used while _hashingChildren. + typedef unordered_set<RecordId, RecordId::Hasher> SeenMap; + SeenMap _seenMap; - // True if we're still intersecting _children[0..._children.size()-1]. - bool _hashingChildren; + // True if we're still intersecting _children[0..._children.size()-1]. + bool _hashingChildren; - // Which child are we currently working on? - size_t _currentChild; + // Which child are we currently working on? + size_t _currentChild; - // Stats - CommonStats _commonStats; - AndHashStats _specificStats; + // Stats + CommonStats _commonStats; + AndHashStats _specificStats; - // The usage in bytes of all buffered data that we're holding. 
- // Memory usage is calculated from keys held in _dataMap only. - // For simplicity, results in _lookAheadResults do not count towards the limit. - size_t _memUsage; + // The usage in bytes of all buffered data that we're holding. + // Memory usage is calculated from keys held in _dataMap only. + // For simplicity, results in _lookAheadResults do not count towards the limit. + size_t _memUsage; - // Upper limit for buffered data memory usage. - // Defaults to 32 MB (See kMaxBytes in and_hash.cpp). - size_t _maxMemUsage; - }; + // Upper limit for buffered data memory usage. + // Defaults to 32 MB (See kMaxBytes in and_hash.cpp). + size_t _maxMemUsage; +}; } // namespace mongo diff --git a/src/mongo/db/exec/and_sorted.cpp b/src/mongo/db/exec/and_sorted.cpp index c407b9c842c..27966791c87 100644 --- a/src/mongo/db/exec/and_sorted.cpp +++ b/src/mongo/db/exec/and_sorted.cpp @@ -35,292 +35,289 @@ namespace mongo { - using std::unique_ptr; - using std::numeric_limits; - using std::vector; - - // static - const char* AndSortedStage::kStageType = "AND_SORTED"; - - AndSortedStage::AndSortedStage(WorkingSet* ws, const Collection* collection) - : _collection(collection), - _ws(ws), - _targetNode(numeric_limits<size_t>::max()), - _targetId(WorkingSet::INVALID_ID), _isEOF(false), - _commonStats(kStageType) { } - - AndSortedStage::~AndSortedStage() { - for (size_t i = 0; i < _children.size(); ++i) { delete _children[i]; } +using std::unique_ptr; +using std::numeric_limits; +using std::vector; + +// static +const char* AndSortedStage::kStageType = "AND_SORTED"; + +AndSortedStage::AndSortedStage(WorkingSet* ws, const Collection* collection) + : _collection(collection), + _ws(ws), + _targetNode(numeric_limits<size_t>::max()), + _targetId(WorkingSet::INVALID_ID), + _isEOF(false), + _commonStats(kStageType) {} + +AndSortedStage::~AndSortedStage() { + for (size_t i = 0; i < _children.size(); ++i) { + delete _children[i]; } +} - void AndSortedStage::addChild(PlanStage* child) { - 
_children.push_back(child); - } - - bool AndSortedStage::isEOF() { return _isEOF; } +void AndSortedStage::addChild(PlanStage* child) { + _children.push_back(child); +} - PlanStage::StageState AndSortedStage::work(WorkingSetID* out) { - ++_commonStats.works; +bool AndSortedStage::isEOF() { + return _isEOF; +} - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); +PlanStage::StageState AndSortedStage::work(WorkingSetID* out) { + ++_commonStats.works; - if (isEOF()) { return PlanStage::IS_EOF; } + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - if (0 == _specificStats.failedAnd.size()) { - _specificStats.failedAnd.resize(_children.size()); - } + if (isEOF()) { + return PlanStage::IS_EOF; + } - // If we don't have any nodes that we're work()-ing until they hit a certain RecordId... - if (0 == _workingTowardRep.size()) { - // Get a target RecordId. - return getTargetLoc(out); - } + if (0 == _specificStats.failedAnd.size()) { + _specificStats.failedAnd.resize(_children.size()); + } - // Move nodes toward the target RecordId. - // If all nodes reach the target RecordId, return it. The next call to work() will set a new - // target. - return moveTowardTargetLoc(out); + // If we don't have any nodes that we're work()-ing until they hit a certain RecordId... + if (0 == _workingTowardRep.size()) { + // Get a target RecordId. + return getTargetLoc(out); } - PlanStage::StageState AndSortedStage::getTargetLoc(WorkingSetID* out) { - verify(numeric_limits<size_t>::max() == _targetNode); - verify(WorkingSet::INVALID_ID == _targetId); - verify(RecordId() == _targetLoc); + // Move nodes toward the target RecordId. + // If all nodes reach the target RecordId, return it. The next call to work() will set a new + // target. + return moveTowardTargetLoc(out); +} - // Pick one, and get a loc to work toward. 
- WorkingSetID id = WorkingSet::INVALID_ID; - StageState state = _children[0]->work(&id); +PlanStage::StageState AndSortedStage::getTargetLoc(WorkingSetID* out) { + verify(numeric_limits<size_t>::max() == _targetNode); + verify(WorkingSet::INVALID_ID == _targetId); + verify(RecordId() == _targetLoc); - if (PlanStage::ADVANCED == state) { - WorkingSetMember* member = _ws->get(id); + // Pick one, and get a loc to work toward. + WorkingSetID id = WorkingSet::INVALID_ID; + StageState state = _children[0]->work(&id); - // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything - // with this WSM. - if (!member->hasLoc()) { - _ws->flagForReview(id); - return PlanStage::NEED_TIME; - } + if (PlanStage::ADVANCED == state) { + WorkingSetMember* member = _ws->get(id); - verify(member->hasLoc()); + // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything + // with this WSM. + if (!member->hasLoc()) { + _ws->flagForReview(id); + return PlanStage::NEED_TIME; + } - // We have a value from one child to AND with. - _targetNode = 0; - _targetId = id; - _targetLoc = member->loc; + verify(member->hasLoc()); - // We have to AND with all other children. - for (size_t i = 1; i < _children.size(); ++i) { - _workingTowardRep.push(i); - } + // We have a value from one child to AND with. + _targetNode = 0; + _targetId = id; + _targetLoc = member->loc; - ++_commonStats.needTime; - return PlanStage::NEED_TIME; + // We have to AND with all other children. + for (size_t i = 1; i < _children.size(); ++i) { + _workingTowardRep.push(i); } - else if (PlanStage::IS_EOF == state) { - _isEOF = true; - return state; + + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::IS_EOF == state) { + _isEOF = true; + return state; + } else if (PlanStage::FAILURE == state) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. 
If ID is invalid, we + // create our own error message. + if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "sorted AND stage failed to read in results from first child"; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); } - else if (PlanStage::FAILURE == state) { + _isEOF = true; + return state; + } else { + if (PlanStage::NEED_TIME == state) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == state) { + ++_commonStats.needYield; *out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. - if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "sorted AND stage failed to read in results from first child"; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } - _isEOF = true; - return state; } - else { - if (PlanStage::NEED_TIME == state) { - ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == state) { - ++_commonStats.needYield; - *out = id; - } - // NEED_TIME, NEED_YIELD. - return state; - } + // NEED_TIME, NEED_YIELD. + return state; } +} - PlanStage::StageState AndSortedStage::moveTowardTargetLoc(WorkingSetID* out) { - verify(numeric_limits<size_t>::max() != _targetNode); - verify(WorkingSet::INVALID_ID != _targetId); +PlanStage::StageState AndSortedStage::moveTowardTargetLoc(WorkingSetID* out) { + verify(numeric_limits<size_t>::max() != _targetNode); + verify(WorkingSet::INVALID_ID != _targetId); - // We have nodes that haven't hit _targetLoc yet. - size_t workingChildNumber = _workingTowardRep.front(); - PlanStage* next = _children[workingChildNumber]; - WorkingSetID id = WorkingSet::INVALID_ID; - StageState state = next->work(&id); + // We have nodes that haven't hit _targetLoc yet. 
+ size_t workingChildNumber = _workingTowardRep.front(); + PlanStage* next = _children[workingChildNumber]; + WorkingSetID id = WorkingSet::INVALID_ID; + StageState state = next->work(&id); - if (PlanStage::ADVANCED == state) { - WorkingSetMember* member = _ws->get(id); + if (PlanStage::ADVANCED == state) { + WorkingSetMember* member = _ws->get(id); - // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything - // with this WSM. - if (!member->hasLoc()) { - _ws->flagForReview(id); - return PlanStage::NEED_TIME; - } + // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything + // with this WSM. + if (!member->hasLoc()) { + _ws->flagForReview(id); + return PlanStage::NEED_TIME; + } - verify(member->hasLoc()); + verify(member->hasLoc()); - if (member->loc == _targetLoc) { - // The front element has hit _targetLoc. Don't move it forward anymore/work on - // another element. - _workingTowardRep.pop(); - AndCommon::mergeFrom(_ws->get(_targetId), *member); - _ws->free(id); + if (member->loc == _targetLoc) { + // The front element has hit _targetLoc. Don't move it forward anymore/work on + // another element. + _workingTowardRep.pop(); + AndCommon::mergeFrom(_ws->get(_targetId), *member); + _ws->free(id); - if (0 == _workingTowardRep.size()) { - WorkingSetID toReturn = _targetId; + if (0 == _workingTowardRep.size()) { + WorkingSetID toReturn = _targetId; - _targetNode = numeric_limits<size_t>::max(); - _targetId = WorkingSet::INVALID_ID; - _targetLoc = RecordId(); + _targetNode = numeric_limits<size_t>::max(); + _targetId = WorkingSet::INVALID_ID; + _targetLoc = RecordId(); - *out = toReturn; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - // More children need to be advanced to _targetLoc. - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - else if (member->loc < _targetLoc) { - // The front element of _workingTowardRep hasn't hit the thing we're AND-ing with - // yet. Try again later. 
- _ws->free(id); - ++_commonStats.needTime; - return PlanStage::NEED_TIME; + *out = toReturn; + ++_commonStats.advanced; + return PlanStage::ADVANCED; } - else { - // member->loc > _targetLoc. - // _targetLoc wasn't successfully AND-ed with the other sub-plans. We toss it and - // try AND-ing with the next value. - _specificStats.failedAnd[_targetNode]++; - - _ws->free(_targetId); - _targetNode = workingChildNumber; - _targetLoc = member->loc; - _targetId = id; - _workingTowardRep = std::queue<size_t>(); - for (size_t i = 0; i < _children.size(); ++i) { - if (workingChildNumber != i) { - _workingTowardRep.push(i); - } + // More children need to be advanced to _targetLoc. + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (member->loc < _targetLoc) { + // The front element of _workingTowardRep hasn't hit the thing we're AND-ing with + // yet. Try again later. + _ws->free(id); + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else { + // member->loc > _targetLoc. + // _targetLoc wasn't successfully AND-ed with the other sub-plans. We toss it and + // try AND-ing with the next value. + _specificStats.failedAnd[_targetNode]++; + + _ws->free(_targetId); + _targetNode = workingChildNumber; + _targetLoc = member->loc; + _targetId = id; + _workingTowardRep = std::queue<size_t>(); + for (size_t i = 0; i < _children.size(); ++i) { + if (workingChildNumber != i) { + _workingTowardRep.push(i); } - // Need time to chase after the new _targetLoc. - ++_commonStats.needTime; - return PlanStage::NEED_TIME; } + // Need time to chase after the new _targetLoc. 
+ ++_commonStats.needTime; + return PlanStage::NEED_TIME; } - else if (PlanStage::IS_EOF == state) { - _isEOF = true; - _ws->free(_targetId); - return state; + } else if (PlanStage::IS_EOF == state) { + _isEOF = true; + _ws->free(_targetId); + return state; + } else if (PlanStage::FAILURE == state || PlanStage::DEAD == state) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. + if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "sorted AND stage failed to read in results from child " << workingChildNumber; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); } - else if (PlanStage::FAILURE == state || PlanStage::DEAD == state) { + _isEOF = true; + _ws->free(_targetId); + return state; + } else { + if (PlanStage::NEED_TIME == state) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == state) { + ++_commonStats.needYield; *out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. 
- if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "sorted AND stage failed to read in results from child " << workingChildNumber; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } - _isEOF = true; - _ws->free(_targetId); - return state; } - else { - if (PlanStage::NEED_TIME == state) { - ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == state) { - ++_commonStats.needYield; - *out = id; - } - return state; - } + return state; } +} - void AndSortedStage::saveState() { - ++_commonStats.yields; +void AndSortedStage::saveState() { + ++_commonStats.yields; - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->saveState(); - } + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->saveState(); } +} - void AndSortedStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; +void AndSortedStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->restoreState(opCtx); - } + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->restoreState(opCtx); } +} - void AndSortedStage::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - ++_commonStats.invalidates; - - if (isEOF()) { return; } - - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->invalidate(txn, dl, type); - } - - if (dl == _targetLoc) { - // We're in the middle of moving children forward until they hit _targetLoc, which is no - // longer a valid target. If it's a deletion we can't AND it with anything, if it's a - // mutation the predicates implied by the AND may no longer be true. So no matter what, - // fetch it, flag for review, and find another _targetLoc. 
- ++_specificStats.flagged; +void AndSortedStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; - // The RecordId could still be a valid result so flag it and save it for later. - WorkingSetCommon::fetchAndInvalidateLoc(txn, _ws->get(_targetId), _collection); - _ws->flagForReview(_targetId); + if (isEOF()) { + return; + } - _targetId = WorkingSet::INVALID_ID; - _targetNode = numeric_limits<size_t>::max(); - _targetLoc = RecordId(); - _workingTowardRep = std::queue<size_t>(); - } + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->invalidate(txn, dl, type); } - vector<PlanStage*> AndSortedStage::getChildren() const { - return _children; + if (dl == _targetLoc) { + // We're in the middle of moving children forward until they hit _targetLoc, which is no + // longer a valid target. If it's a deletion we can't AND it with anything, if it's a + // mutation the predicates implied by the AND may no longer be true. So no matter what, + // fetch it, flag for review, and find another _targetLoc. + ++_specificStats.flagged; + + // The RecordId could still be a valid result so flag it and save it for later. 
+ WorkingSetCommon::fetchAndInvalidateLoc(txn, _ws->get(_targetId), _collection); + _ws->flagForReview(_targetId); + + _targetId = WorkingSet::INVALID_ID; + _targetNode = numeric_limits<size_t>::max(); + _targetLoc = RecordId(); + _workingTowardRep = std::queue<size_t>(); } +} - PlanStageStats* AndSortedStage::getStats() { - _commonStats.isEOF = isEOF(); +vector<PlanStage*> AndSortedStage::getChildren() const { + return _children; +} - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_AND_SORTED)); - ret->specific.reset(new AndSortedStats(_specificStats)); - for (size_t i = 0; i < _children.size(); ++i) { - ret->children.push_back(_children[i]->getStats()); - } +PlanStageStats* AndSortedStage::getStats() { + _commonStats.isEOF = isEOF(); - return ret.release(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_AND_SORTED)); + ret->specific.reset(new AndSortedStats(_specificStats)); + for (size_t i = 0; i < _children.size(); ++i) { + ret->children.push_back(_children[i]->getStats()); } - const CommonStats* AndSortedStage::getCommonStats() const { - return &_commonStats; - } + return ret.release(); +} - const SpecificStats* AndSortedStage::getSpecificStats() const { - return &_specificStats; - } +const CommonStats* AndSortedStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* AndSortedStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/and_sorted.h b/src/mongo/db/exec/and_sorted.h index 424cda280f6..3a25aa8456e 100644 --- a/src/mongo/db/exec/and_sorted.h +++ b/src/mongo/db/exec/and_sorted.h @@ -39,77 +39,79 @@ namespace mongo { - /** - * Reads from N children, each of which must have a valid RecordId. Assumes each child produces - * RecordIds in sorted order. Outputs the intersection of the RecordIds outputted by the - * children. - * - * Preconditions: Valid RecordId. More than one child. 
- * - * Any RecordId that we keep a reference to that is invalidated before we are able to return it - * is fetched and added to the WorkingSet as "flagged for further review." Because this stage - * operates with RecordIds, we are unable to evaluate the AND for the invalidated RecordId, and it - * must be fully matched later. - */ - class AndSortedStage : public PlanStage { - public: - AndSortedStage(WorkingSet* ws, const Collection* collection); - virtual ~AndSortedStage(); +/** + * Reads from N children, each of which must have a valid RecordId. Assumes each child produces + * RecordIds in sorted order. Outputs the intersection of the RecordIds outputted by the + * children. + * + * Preconditions: Valid RecordId. More than one child. + * + * Any RecordId that we keep a reference to that is invalidated before we are able to return it + * is fetched and added to the WorkingSet as "flagged for further review." Because this stage + * operates with RecordIds, we are unable to evaluate the AND for the invalidated RecordId, and it + * must be fully matched later. 
+ */ +class AndSortedStage : public PlanStage { +public: + AndSortedStage(WorkingSet* ws, const Collection* collection); + virtual ~AndSortedStage(); - void addChild(PlanStage* child); + void addChild(PlanStage* child); - virtual StageState work(WorkingSetID* out); - virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_AND_SORTED; } + virtual StageType stageType() const { + return STAGE_AND_SORTED; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - // Find a node to AND against. - PlanStage::StageState getTargetLoc(WorkingSetID* out); +private: + // Find a node to AND against. + PlanStage::StageState getTargetLoc(WorkingSetID* out); - // Move a child which hasn't advanced to the target node forward. - // Returns the target node in 'out' if all children successfully advance to it. - PlanStage::StageState moveTowardTargetLoc(WorkingSetID* out); + // Move a child which hasn't advanced to the target node forward. + // Returns the target node in 'out' if all children successfully advance to it. + PlanStage::StageState moveTowardTargetLoc(WorkingSetID* out); - // Not owned by us. 
- const Collection* _collection; + // Not owned by us. + const Collection* _collection; - // Not owned by us. - WorkingSet* _ws; + // Not owned by us. + WorkingSet* _ws; - // Owned by us. - std::vector<PlanStage*> _children; + // Owned by us. + std::vector<PlanStage*> _children; - // The current node we're AND-ing against. - size_t _targetNode; - RecordId _targetLoc; - WorkingSetID _targetId; + // The current node we're AND-ing against. + size_t _targetNode; + RecordId _targetLoc; + WorkingSetID _targetId; - // Nodes we're moving forward until they hit the element we're AND-ing. - // Everything in here has not advanced to _targetLoc yet. - // These are indices into _children. - std::queue<size_t> _workingTowardRep; + // Nodes we're moving forward until they hit the element we're AND-ing. + // Everything in here has not advanced to _targetLoc yet. + // These are indices into _children. + std::queue<size_t> _workingTowardRep; - // If any child hits EOF or if we have any errors, we're EOF. - bool _isEOF; + // If any child hits EOF or if we have any errors, we're EOF. 
+ bool _isEOF; - // Stats - CommonStats _commonStats; - AndSortedStats _specificStats; - }; + // Stats + CommonStats _commonStats; + AndSortedStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/cached_plan.cpp b/src/mongo/db/exec/cached_plan.cpp index 3c512e01890..78894d28d35 100644 --- a/src/mongo/db/exec/cached_plan.cpp +++ b/src/mongo/db/exec/cached_plan.cpp @@ -49,338 +49,317 @@ namespace mongo { - // static - const char* CachedPlanStage::kStageType = "CACHED_PLAN"; - - CachedPlanStage::CachedPlanStage(OperationContext* txn, - Collection* collection, - WorkingSet* ws, - CanonicalQuery* cq, - const QueryPlannerParams& params, - size_t decisionWorks, - PlanStage* root) - : _txn(txn), - _collection(collection), - _ws(ws), - _canonicalQuery(cq), - _plannerParams(params), - _decisionWorks(decisionWorks), - _root(root), - _commonStats(kStageType) { - invariant(_collection); - } - - Status CachedPlanStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) { - // Adds the amount of time taken by pickBestPlan() to executionTimeMillis. There's lots of - // execution work that happens here, so this is needed for the time accounting to - // make sense. - ScopedTimer timer(&_commonStats.executionTimeMillis); - - // If we work this many times during the trial period, then we will replan the - // query from scratch. - size_t maxWorksBeforeReplan = static_cast<size_t>(internalQueryCacheEvictionRatio - * _decisionWorks); - - // The trial period ends without replanning if the cached plan produces this many results. - size_t numResults = MultiPlanStage::getTrialPeriodNumToReturn(*_canonicalQuery); - - for (size_t i = 0; i < maxWorksBeforeReplan; ++i) { - // Might need to yield between calls to work due to the timer elapsing. 
- Status yieldStatus = tryYield(yieldPolicy); - if (!yieldStatus.isOK()) { - return yieldStatus; - } +// static +const char* CachedPlanStage::kStageType = "CACHED_PLAN"; + +CachedPlanStage::CachedPlanStage(OperationContext* txn, + Collection* collection, + WorkingSet* ws, + CanonicalQuery* cq, + const QueryPlannerParams& params, + size_t decisionWorks, + PlanStage* root) + : _txn(txn), + _collection(collection), + _ws(ws), + _canonicalQuery(cq), + _plannerParams(params), + _decisionWorks(decisionWorks), + _root(root), + _commonStats(kStageType) { + invariant(_collection); +} + +Status CachedPlanStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) { + // Adds the amount of time taken by pickBestPlan() to executionTimeMillis. There's lots of + // execution work that happens here, so this is needed for the time accounting to + // make sense. + ScopedTimer timer(&_commonStats.executionTimeMillis); + + // If we work this many times during the trial period, then we will replan the + // query from scratch. + size_t maxWorksBeforeReplan = + static_cast<size_t>(internalQueryCacheEvictionRatio * _decisionWorks); + + // The trial period ends without replanning if the cached plan produces this many results. + size_t numResults = MultiPlanStage::getTrialPeriodNumToReturn(*_canonicalQuery); + + for (size_t i = 0; i < maxWorksBeforeReplan; ++i) { + // Might need to yield between calls to work due to the timer elapsing. + Status yieldStatus = tryYield(yieldPolicy); + if (!yieldStatus.isOK()) { + return yieldStatus; + } - WorkingSetID id = WorkingSet::INVALID_ID; - PlanStage::StageState state = _root->work(&id); + WorkingSetID id = WorkingSet::INVALID_ID; + PlanStage::StageState state = _root->work(&id); - if (PlanStage::ADVANCED == state) { - // Save result for later. - _results.push_back(id); + if (PlanStage::ADVANCED == state) { + // Save result for later. + _results.push_back(id); - if (_results.size() >= numResults) { - // Once a plan returns enough results, stop working. 
Update cache with stats - // from this run and return. - updatePlanCache(); - return Status::OK(); - } - } - else if (PlanStage::IS_EOF == state) { - // Cached plan hit EOF quickly enough. No need to replan. Update cache with stats + if (_results.size() >= numResults) { + // Once a plan returns enough results, stop working. Update cache with stats // from this run and return. updatePlanCache(); return Status::OK(); } - else if (PlanStage::NEED_YIELD == state) { - if (id == WorkingSet::INVALID_ID) { - if (!yieldPolicy->allowedToYield()) { - throw WriteConflictException(); - } - } - else { - WorkingSetMember* member = _ws->get(id); - invariant(member->hasFetcher()); - // Transfer ownership of the fetcher and yield. - _fetcher.reset(member->releaseFetcher()); - } - - if (yieldPolicy->allowedToYield()) { - yieldPolicy->forceYield(); - } - - Status yieldStatus = tryYield(yieldPolicy); - if (!yieldStatus.isOK()) { - return yieldStatus; + } else if (PlanStage::IS_EOF == state) { + // Cached plan hit EOF quickly enough. No need to replan. Update cache with stats + // from this run and return. + updatePlanCache(); + return Status::OK(); + } else if (PlanStage::NEED_YIELD == state) { + if (id == WorkingSet::INVALID_ID) { + if (!yieldPolicy->allowedToYield()) { + throw WriteConflictException(); } + } else { + WorkingSetMember* member = _ws->get(id); + invariant(member->hasFetcher()); + // Transfer ownership of the fetcher and yield. + _fetcher.reset(member->releaseFetcher()); } - else if (PlanStage::FAILURE == state) { - // On failure, fall back to replanning the whole query. We neither evict the - // existing cache entry nor cache the result of replanning. - BSONObj statusObj; - WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj); - - LOG(1) << "Execution of cached plan failed, falling back to replan." 
- << " query: " - << _canonicalQuery->toStringShort() - << " planSummary: " - << Explain::getPlanSummary(_root.get()) - << " status: " - << statusObj; - - const bool shouldCache = false; - return replan(yieldPolicy, shouldCache); - } - else if (PlanStage::DEAD == state) { - BSONObj statusObj; - WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj); - - LOG(1) << "Execution of cached plan failed: PlanStage died" - << ", query: " - << _canonicalQuery->toStringShort() - << " planSummary: " - << Explain::getPlanSummary(_root.get()) - << " status: " - << statusObj; - - return WorkingSetCommon::getMemberObjectStatus(statusObj); - } - else { - invariant(PlanStage::NEED_TIME == state); - } - } - // If we're here, the trial period took more than 'maxWorksBeforeReplan' work cycles. This - // plan is taking too long, so we replan from scratch. - LOG(1) << "Execution of cached plan required " - << maxWorksBeforeReplan - << " works, but was originally cached with only " - << _decisionWorks - << " works. Evicting cache entry and replanning query: " - << _canonicalQuery->toStringShort() - << " plan summary before replan: " - << Explain::getPlanSummary(_root.get()); - - const bool shouldCache = true; - return replan(yieldPolicy, shouldCache); - } + if (yieldPolicy->allowedToYield()) { + yieldPolicy->forceYield(); + } - Status CachedPlanStage::tryYield(PlanYieldPolicy* yieldPolicy) { - // These are the conditions which can cause us to yield: - // 1) The yield policy's timer elapsed, or - // 2) some stage requested a yield due to a document fetch, or - // 3) we need to yield and retry due to a WriteConflictException. - // In all cases, the actual yielding happens here. - if (yieldPolicy->shouldYield()) { - // Here's where we yield. 
- bool alive = yieldPolicy->yield(_fetcher.get()); - - if (!alive) { - return Status(ErrorCodes::OperationFailed, - "CachedPlanStage killed during plan selection"); + Status yieldStatus = tryYield(yieldPolicy); + if (!yieldStatus.isOK()) { + return yieldStatus; } + } else if (PlanStage::FAILURE == state) { + // On failure, fall back to replanning the whole query. We neither evict the + // existing cache entry nor cache the result of replanning. + BSONObj statusObj; + WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj); + + LOG(1) << "Execution of cached plan failed, falling back to replan." + << " query: " << _canonicalQuery->toStringShort() + << " planSummary: " << Explain::getPlanSummary(_root.get()) + << " status: " << statusObj; + + const bool shouldCache = false; + return replan(yieldPolicy, shouldCache); + } else if (PlanStage::DEAD == state) { + BSONObj statusObj; + WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj); + + LOG(1) << "Execution of cached plan failed: PlanStage died" + << ", query: " << _canonicalQuery->toStringShort() + << " planSummary: " << Explain::getPlanSummary(_root.get()) + << " status: " << statusObj; + + return WorkingSetCommon::getMemberObjectStatus(statusObj); + } else { + invariant(PlanStage::NEED_TIME == state); } - - // We're done using the fetcher, so it should be freed. We don't want to - // use the same RecordFetcher twice. - _fetcher.reset(); - - return Status::OK(); } - Status CachedPlanStage::replan(PlanYieldPolicy* yieldPolicy, bool shouldCache) { - // We're going to start over with a new plan. No need for only old buffered results. - _results.clear(); - - // Clear out the working set. We'll start with a fresh working set. - _ws->clear(); - - // Use the query planning module to plan the whole query. 
- std::vector<QuerySolution*> rawSolutions; - Status status = QueryPlanner::plan(*_canonicalQuery, _plannerParams, &rawSolutions); - if (!status.isOK()) { - return Status(ErrorCodes::BadValue, - str::stream() - << "error processing query: " << _canonicalQuery->toString() - << " planner returned error: " << status.reason()); + // If we're here, the trial period took more than 'maxWorksBeforeReplan' work cycles. This + // plan is taking too long, so we replan from scratch. + LOG(1) << "Execution of cached plan required " << maxWorksBeforeReplan + << " works, but was originally cached with only " << _decisionWorks + << " works. Evicting cache entry and replanning query: " + << _canonicalQuery->toStringShort() + << " plan summary before replan: " << Explain::getPlanSummary(_root.get()); + + const bool shouldCache = true; + return replan(yieldPolicy, shouldCache); +} + +Status CachedPlanStage::tryYield(PlanYieldPolicy* yieldPolicy) { + // These are the conditions which can cause us to yield: + // 1) The yield policy's timer elapsed, or + // 2) some stage requested a yield due to a document fetch, or + // 3) we need to yield and retry due to a WriteConflictException. + // In all cases, the actual yielding happens here. + if (yieldPolicy->shouldYield()) { + // Here's where we yield. + bool alive = yieldPolicy->yield(_fetcher.get()); + + if (!alive) { + return Status(ErrorCodes::OperationFailed, + "CachedPlanStage killed during plan selection"); } + } - OwnedPointerVector<QuerySolution> solutions(rawSolutions); + // We're done using the fetcher, so it should be freed. We don't want to + // use the same RecordFetcher twice. + _fetcher.reset(); - // We cannot figure out how to answer the query. Perhaps it requires an index - // we do not have? 
- if (0 == solutions.size()) { - return Status(ErrorCodes::BadValue, - str::stream() - << "error processing query: " - << _canonicalQuery->toString() - << " No query solutions"); - } + return Status::OK(); +} - if (1 == solutions.size()) { - // If there's only one solution, it won't get cached. Make sure to evict the existing - // cache entry if requested by the caller. - if (shouldCache) { - PlanCache* cache = _collection->infoCache()->getPlanCache(); - cache->remove(*_canonicalQuery); - } +Status CachedPlanStage::replan(PlanYieldPolicy* yieldPolicy, bool shouldCache) { + // We're going to start over with a new plan. No need for only old buffered results. + _results.clear(); - PlanStage* newRoot; - // Only one possible plan. Build the stages from the solution. - verify(StageBuilder::build(_txn, _collection, *solutions[0], _ws, &newRoot)); - _root.reset(newRoot); - _replannedQs.reset(solutions.popAndReleaseBack()); - return Status::OK(); - } + // Clear out the working set. We'll start with a fresh working set. + _ws->clear(); - // Many solutions. Create a MultiPlanStage to pick the best, update the cache, - // and so on. The working set will be shared by all candidate plans. - _root.reset(new MultiPlanStage(_txn, _collection, _canonicalQuery, shouldCache)); - MultiPlanStage* multiPlanStage = static_cast<MultiPlanStage*>(_root.get()); + // Use the query planning module to plan the whole query. 
+ std::vector<QuerySolution*> rawSolutions; + Status status = QueryPlanner::plan(*_canonicalQuery, _plannerParams, &rawSolutions); + if (!status.isOK()) { + return Status(ErrorCodes::BadValue, + str::stream() << "error processing query: " << _canonicalQuery->toString() + << " planner returned error: " << status.reason()); + } - for (size_t ix = 0; ix < solutions.size(); ++ix) { - if (solutions[ix]->cacheData.get()) { - solutions[ix]->cacheData->indexFilterApplied = _plannerParams.indexFiltersApplied; - } + OwnedPointerVector<QuerySolution> solutions(rawSolutions); - PlanStage* nextPlanRoot; - verify(StageBuilder::build(_txn, _collection, *solutions[ix], _ws, &nextPlanRoot)); + // We cannot figure out how to answer the query. Perhaps it requires an index + // we do not have? + if (0 == solutions.size()) { + return Status(ErrorCodes::BadValue, + str::stream() << "error processing query: " << _canonicalQuery->toString() + << " No query solutions"); + } - // Takes ownership of 'solutions[ix]' and 'nextPlanRoot'. - multiPlanStage->addPlan(solutions.releaseAt(ix), nextPlanRoot, _ws); + if (1 == solutions.size()) { + // If there's only one solution, it won't get cached. Make sure to evict the existing + // cache entry if requested by the caller. + if (shouldCache) { + PlanCache* cache = _collection->infoCache()->getPlanCache(); + cache->remove(*_canonicalQuery); } - // Delegate to the MultiPlanStage's plan selection facility. - return multiPlanStage->pickBestPlan(yieldPolicy); - } - - bool CachedPlanStage::isEOF() { - return _results.empty() && _root->isEOF(); + PlanStage* newRoot; + // Only one possible plan. Build the stages from the solution. 
+ verify(StageBuilder::build(_txn, _collection, *solutions[0], _ws, &newRoot)); + _root.reset(newRoot); + _replannedQs.reset(solutions.popAndReleaseBack()); + return Status::OK(); } - PlanStage::StageState CachedPlanStage::work(WorkingSetID* out) { - ++_commonStats.works; - - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); - - if (isEOF()) { return PlanStage::IS_EOF; } + // Many solutions. Create a MultiPlanStage to pick the best, update the cache, + // and so on. The working set will be shared by all candidate plans. + _root.reset(new MultiPlanStage(_txn, _collection, _canonicalQuery, shouldCache)); + MultiPlanStage* multiPlanStage = static_cast<MultiPlanStage*>(_root.get()); - // First exhaust any results buffered during the trial period. - if (!_results.empty()) { - *out = _results.front(); - _results.pop_front(); - _commonStats.advanced++; - return PlanStage::ADVANCED; + for (size_t ix = 0; ix < solutions.size(); ++ix) { + if (solutions[ix]->cacheData.get()) { + solutions[ix]->cacheData->indexFilterApplied = _plannerParams.indexFiltersApplied; } - // Nothing left in trial period buffer. - StageState childStatus = _root->work(out); + PlanStage* nextPlanRoot; + verify(StageBuilder::build(_txn, _collection, *solutions[ix], _ws, &nextPlanRoot)); - if (PlanStage::ADVANCED == childStatus) { - _commonStats.advanced++; - } - else if (PlanStage::NEED_YIELD == childStatus) { - _commonStats.needYield++; - } - else if (PlanStage::NEED_TIME == childStatus) { - _commonStats.needTime++; - } - - return childStatus; + // Takes ownership of 'solutions[ix]' and 'nextPlanRoot'. + multiPlanStage->addPlan(solutions.releaseAt(ix), nextPlanRoot, _ws); } - void CachedPlanStage::saveState() { - _txn = NULL; - ++_commonStats.yields; - _root->saveState(); - } + // Delegate to the MultiPlanStage's plan selection facility. 
+ return multiPlanStage->pickBestPlan(yieldPolicy); +} - void CachedPlanStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; +bool CachedPlanStage::isEOF() { + return _results.empty() && _root->isEOF(); +} - ++_commonStats.unyields; - _root->restoreState(opCtx); - } +PlanStage::StageState CachedPlanStage::work(WorkingSetID* out) { + ++_commonStats.works; - void CachedPlanStage::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - _root->invalidate(txn, dl, type); - ++_commonStats.invalidates; - - for (std::list<WorkingSetID>::iterator it = _results.begin(); it != _results.end(); ) { - WorkingSetMember* member = _ws->get(*it); - if (member->hasLoc() && member->loc == dl) { - std::list<WorkingSetID>::iterator next = it; - ++next; - WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); - _results.erase(it); - it = next; - } - else { - ++it; - } - } - } + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - std::vector<PlanStage*> CachedPlanStage::getChildren() const { - return { _root.get() }; + if (isEOF()) { + return PlanStage::IS_EOF; } - PlanStageStats* CachedPlanStage::getStats() { - _commonStats.isEOF = isEOF(); - - std::unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_CACHED_PLAN)); - ret->specific.reset(new CachedPlanStats(_specificStats)); - ret->children.push_back(_root->getStats()); - - return ret.release(); + // First exhaust any results buffered during the trial period. + if (!_results.empty()) { + *out = _results.front(); + _results.pop_front(); + _commonStats.advanced++; + return PlanStage::ADVANCED; } - const CommonStats* CachedPlanStage::getCommonStats() const { - return &_commonStats; - } + // Nothing left in trial period buffer. 
+ StageState childStatus = _root->work(out); - const SpecificStats* CachedPlanStage::getSpecificStats() const { - return &_specificStats; + if (PlanStage::ADVANCED == childStatus) { + _commonStats.advanced++; + } else if (PlanStage::NEED_YIELD == childStatus) { + _commonStats.needYield++; + } else if (PlanStage::NEED_TIME == childStatus) { + _commonStats.needTime++; } - void CachedPlanStage::updatePlanCache() { - std::unique_ptr<PlanCacheEntryFeedback> feedback(new PlanCacheEntryFeedback()); - feedback->stats.reset(getStats()); - feedback->score = PlanRanker::scoreTree(feedback->stats.get()); - - PlanCache* cache = _collection->infoCache()->getPlanCache(); - Status fbs = cache->feedback(*_canonicalQuery, feedback.release()); - if (!fbs.isOK()) { - LOG(5) << _canonicalQuery->ns() << ": Failed to update cache with feedback: " - << fbs.toString() << " - " - << "(query: " << _canonicalQuery->getQueryObj() - << "; sort: " << _canonicalQuery->getParsed().getSort() - << "; projection: " << _canonicalQuery->getParsed().getProj() - << ") is no longer in plan cache."; + return childStatus; +} + +void CachedPlanStage::saveState() { + _txn = NULL; + ++_commonStats.yields; + _root->saveState(); +} + +void CachedPlanStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + + ++_commonStats.unyields; + _root->restoreState(opCtx); +} + +void CachedPlanStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + _root->invalidate(txn, dl, type); + ++_commonStats.invalidates; + + for (std::list<WorkingSetID>::iterator it = _results.begin(); it != _results.end();) { + WorkingSetMember* member = _ws->get(*it); + if (member->hasLoc() && member->loc == dl) { + std::list<WorkingSetID>::iterator next = it; + ++next; + WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); + _results.erase(it); + it = next; + } else { + ++it; } } +} + +std::vector<PlanStage*> CachedPlanStage::getChildren() const { + return 
{_root.get()}; +} + +PlanStageStats* CachedPlanStage::getStats() { + _commonStats.isEOF = isEOF(); + + std::unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_CACHED_PLAN)); + ret->specific.reset(new CachedPlanStats(_specificStats)); + ret->children.push_back(_root->getStats()); + + return ret.release(); +} + +const CommonStats* CachedPlanStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* CachedPlanStage::getSpecificStats() const { + return &_specificStats; +} + +void CachedPlanStage::updatePlanCache() { + std::unique_ptr<PlanCacheEntryFeedback> feedback(new PlanCacheEntryFeedback()); + feedback->stats.reset(getStats()); + feedback->score = PlanRanker::scoreTree(feedback->stats.get()); + + PlanCache* cache = _collection->infoCache()->getPlanCache(); + Status fbs = cache->feedback(*_canonicalQuery, feedback.release()); + if (!fbs.isOK()) { + LOG(5) << _canonicalQuery->ns() + << ": Failed to update cache with feedback: " << fbs.toString() << " - " + << "(query: " << _canonicalQuery->getQueryObj() + << "; sort: " << _canonicalQuery->getParsed().getSort() + << "; projection: " << _canonicalQuery->getParsed().getProj() + << ") is no longer in plan cache."; + } +} } // namespace mongo diff --git a/src/mongo/db/exec/cached_plan.h b/src/mongo/db/exec/cached_plan.h index ccc82d9cc7d..937b18a8ad2 100644 --- a/src/mongo/db/exec/cached_plan.h +++ b/src/mongo/db/exec/cached_plan.h @@ -42,118 +42,120 @@ namespace mongo { - class PlanYieldPolicy; +class PlanYieldPolicy; + +/** + * This stage outputs its mainChild, and possibly its backup child + * and also updates the cache. + * + * Preconditions: Valid RecordId. 
+ * + */ +class CachedPlanStage : public PlanStage { +public: + CachedPlanStage(OperationContext* txn, + Collection* collection, + WorkingSet* ws, + CanonicalQuery* cq, + const QueryPlannerParams& params, + size_t decisionWorks, + PlanStage* root); + + virtual bool isEOF(); + + virtual StageState work(WorkingSetID* out); + + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + + virtual std::vector<PlanStage*> getChildren() const; + + virtual StageType stageType() const { + return STAGE_CACHED_PLAN; + } + + virtual PlanStageStats* getStats(); + + virtual const CommonStats* getCommonStats() const; + + virtual const SpecificStats* getSpecificStats() const; + + static const char* kStageType; /** - * This stage outputs its mainChild, and possibly its backup child - * and also updates the cache. + * Runs the cached plan for a trial period, yielding during the trial period according to + * 'yieldPolicy'. * - * Preconditions: Valid RecordId. + * Feedback from the trial period is passed to the plan cache. If the performance is lower + * than expected, the old plan is evicted and a new plan is selected from scratch (again + * yielding according to 'yieldPolicy'). Otherwise, the cached plan is run. + */ + Status pickBestPlan(PlanYieldPolicy* yieldPolicy); + +private: + /** + * Passes stats from the trial period run of the cached plan to the plan cache. * + * If the plan cache entry is deleted before we get a chance to update it, then this + * is a no-op. 
*/ - class CachedPlanStage : public PlanStage { - public: - CachedPlanStage(OperationContext* txn, - Collection* collection, - WorkingSet* ws, - CanonicalQuery* cq, - const QueryPlannerParams& params, - size_t decisionWorks, - PlanStage* root); - - virtual bool isEOF(); - - virtual StageState work(WorkingSetID* out); - - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - - virtual std::vector<PlanStage*> getChildren() const; - - virtual StageType stageType() const { return STAGE_CACHED_PLAN; } - - virtual PlanStageStats* getStats(); - - virtual const CommonStats* getCommonStats() const; - - virtual const SpecificStats* getSpecificStats() const; - - static const char* kStageType; - - /** - * Runs the cached plan for a trial period, yielding during the trial period according to - * 'yieldPolicy'. - * - * Feedback from the trial period is passed to the plan cache. If the performance is lower - * than expected, the old plan is evicted and a new plan is selected from scratch (again - * yielding according to 'yieldPolicy'). Otherwise, the cached plan is run. - */ - Status pickBestPlan(PlanYieldPolicy* yieldPolicy); - - private: - /** - * Passes stats from the trial period run of the cached plan to the plan cache. - * - * If the plan cache entry is deleted before we get a chance to update it, then this - * is a no-op. - */ - void updatePlanCache(); - - /** - * Uses the QueryPlanner and the MultiPlanStage to re-generate candidate plans for this - * query and select a new winner. - * - * We fallback to a new plan if updatePlanCache() tells us that the performance was worse - * than anticipated during the trial period. - * - * We only write the result of re-planning to the plan cache if 'shouldCache' is true. 
- */ - Status replan(PlanYieldPolicy* yieldPolicy, bool shouldCache); - - /** - * May yield during the cached plan stage's trial period or replanning phases. - * - * Returns a non-OK status if the plan was killed during a yield. - */ - Status tryYield(PlanYieldPolicy* yieldPolicy); - - // Not owned. - OperationContext* _txn; - - // Not owned. Must be non-null. - Collection* _collection; - - // Not owned. - WorkingSet* _ws; - - // Not owned. - CanonicalQuery* _canonicalQuery; - - QueryPlannerParams _plannerParams; - - // The number of work cycles taken to decide on a winning plan when the plan was first - // cached. - size_t _decisionWorks; - - // If we fall back to re-planning the query, and there is just one resulting query solution, - // that solution is owned here. - std::unique_ptr<QuerySolution> _replannedQs; - - std::unique_ptr<PlanStage> _root; - - // Any results produced during trial period execution are kept here. - std::list<WorkingSetID> _results; - - // When a stage requests a yield for document fetch, it gives us back a RecordFetcher* - // to use to pull the record into memory. We take ownership of the RecordFetcher here, - // deleting it after we've had a chance to do the fetch. For timing-based yields, we - // just pass a NULL fetcher. - std::unique_ptr<RecordFetcher> _fetcher; - - // Stats - CommonStats _commonStats; - CachedPlanStats _specificStats; - }; + void updatePlanCache(); + + /** + * Uses the QueryPlanner and the MultiPlanStage to re-generate candidate plans for this + * query and select a new winner. + * + * We fallback to a new plan if updatePlanCache() tells us that the performance was worse + * than anticipated during the trial period. + * + * We only write the result of re-planning to the plan cache if 'shouldCache' is true. + */ + Status replan(PlanYieldPolicy* yieldPolicy, bool shouldCache); + + /** + * May yield during the cached plan stage's trial period or replanning phases. 
+ * + * Returns a non-OK status if the plan was killed during a yield. + */ + Status tryYield(PlanYieldPolicy* yieldPolicy); + + // Not owned. + OperationContext* _txn; + + // Not owned. Must be non-null. + Collection* _collection; + + // Not owned. + WorkingSet* _ws; + + // Not owned. + CanonicalQuery* _canonicalQuery; + + QueryPlannerParams _plannerParams; + + // The number of work cycles taken to decide on a winning plan when the plan was first + // cached. + size_t _decisionWorks; + + // If we fall back to re-planning the query, and there is just one resulting query solution, + // that solution is owned here. + std::unique_ptr<QuerySolution> _replannedQs; + + std::unique_ptr<PlanStage> _root; + + // Any results produced during trial period execution are kept here. + std::list<WorkingSetID> _results; + + // When a stage requests a yield for document fetch, it gives us back a RecordFetcher* + // to use to pull the record into memory. We take ownership of the RecordFetcher here, + // deleting it after we've had a chance to do the fetch. For timing-based yields, we + // just pass a NULL fetcher. 
+ std::unique_ptr<RecordFetcher> _fetcher; + + // Stats + CommonStats _commonStats; + CachedPlanStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/collection_scan.cpp b/src/mongo/db/exec/collection_scan.cpp index 1a0c16c6b55..f0e09f31629 100644 --- a/src/mongo/db/exec/collection_scan.cpp +++ b/src/mongo/db/exec/collection_scan.cpp @@ -42,225 +42,221 @@ #include "mongo/util/fail_point_service.h" #include "mongo/util/log.h" -#include "mongo/db/client.h" // XXX-ERH +#include "mongo/db/client.h" // XXX-ERH namespace mongo { - using std::unique_ptr; - using std::vector; - - // static - const char* CollectionScan::kStageType = "COLLSCAN"; - - CollectionScan::CollectionScan(OperationContext* txn, - const CollectionScanParams& params, - WorkingSet* workingSet, - const MatchExpression* filter) - : _txn(txn), - _workingSet(workingSet), - _filter(filter), - _params(params), - _isDead(false), - _wsidForFetch(_workingSet->allocate()), - _commonStats(kStageType) { - // Explain reports the direction of the collection scan. - _specificStats.direction = params.direction; - - // We pre-allocate a WSM and use it to pass up fetch requests. This should never be used - // for anything other than passing up NEED_YIELD. We use the loc and owned obj state, but - // the loc isn't really pointing at any obj. The obj field of the WSM should never be used. - WorkingSetMember* member = _workingSet->get(_wsidForFetch); - member->state = WorkingSetMember::LOC_AND_OWNED_OBJ; +using std::unique_ptr; +using std::vector; + +// static +const char* CollectionScan::kStageType = "COLLSCAN"; + +CollectionScan::CollectionScan(OperationContext* txn, + const CollectionScanParams& params, + WorkingSet* workingSet, + const MatchExpression* filter) + : _txn(txn), + _workingSet(workingSet), + _filter(filter), + _params(params), + _isDead(false), + _wsidForFetch(_workingSet->allocate()), + _commonStats(kStageType) { + // Explain reports the direction of the collection scan. 
+ _specificStats.direction = params.direction; + + // We pre-allocate a WSM and use it to pass up fetch requests. This should never be used + // for anything other than passing up NEED_YIELD. We use the loc and owned obj state, but + // the loc isn't really pointing at any obj. The obj field of the WSM should never be used. + WorkingSetMember* member = _workingSet->get(_wsidForFetch); + member->state = WorkingSetMember::LOC_AND_OWNED_OBJ; +} + +PlanStage::StageState CollectionScan::work(WorkingSetID* out) { + ++_commonStats.works; + + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); + + if (_isDead) { + Status status(ErrorCodes::InternalError, "CollectionScan died"); + *out = WorkingSetCommon::allocateStatusMember(_workingSet, status); + return PlanStage::DEAD; } - PlanStage::StageState CollectionScan::work(WorkingSetID* out) { - ++_commonStats.works; - - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); - - if (_isDead) { - Status status(ErrorCodes::InternalError, "CollectionScan died"); - *out = WorkingSetCommon::allocateStatusMember(_workingSet, status); - return PlanStage::DEAD; - } + if ((0 != _params.maxScan) && (_specificStats.docsTested >= _params.maxScan)) { + _commonStats.isEOF = true; + } - if ((0 != _params.maxScan) && (_specificStats.docsTested >= _params.maxScan)) { - _commonStats.isEOF = true; - } + if (_commonStats.isEOF) { + return PlanStage::IS_EOF; + } - if (_commonStats.isEOF) { return PlanStage::IS_EOF; } - - boost::optional<Record> record; - const bool needToMakeCursor = !_cursor; - try { - if (needToMakeCursor) { - const bool forward = _params.direction == CollectionScanParams::FORWARD; - _cursor = _params.collection->getCursor(_txn, forward); - - if (!_lastSeenId.isNull()) { - invariant(_params.tailable); - // Seek to where we were last time. 
If it no longer exists, mark us as dead - // since we want to signal an error rather than silently dropping data from the - // stream. This is related to the _lastSeenId handling in invalidate. Note that - // we want to return the record *after* this one since we have already returned - // this one. This is only possible in the tailing case because that is the only - // time we'd need to create a cursor after already getting a record out of it. - if (!_cursor->seekExact(_lastSeenId)) { - _isDead = true; - Status status(ErrorCodes::InternalError, - "CollectionScan died: Unexpected RecordId"); - *out = WorkingSetCommon::allocateStatusMember(_workingSet, status); - return PlanStage::DEAD; - } + boost::optional<Record> record; + const bool needToMakeCursor = !_cursor; + try { + if (needToMakeCursor) { + const bool forward = _params.direction == CollectionScanParams::FORWARD; + _cursor = _params.collection->getCursor(_txn, forward); + + if (!_lastSeenId.isNull()) { + invariant(_params.tailable); + // Seek to where we were last time. If it no longer exists, mark us as dead + // since we want to signal an error rather than silently dropping data from the + // stream. This is related to the _lastSeenId handling in invalidate. Note that + // we want to return the record *after* this one since we have already returned + // this one. This is only possible in the tailing case because that is the only + // time we'd need to create a cursor after already getting a record out of it. + if (!_cursor->seekExact(_lastSeenId)) { + _isDead = true; + Status status(ErrorCodes::InternalError, + "CollectionScan died: Unexpected RecordId"); + *out = WorkingSetCommon::allocateStatusMember(_workingSet, status); + return PlanStage::DEAD; } - - _commonStats.needTime++; - return PlanStage::NEED_TIME; - } - - if (_lastSeenId.isNull() && !_params.start.isNull()) { - record = _cursor->seekExact(_params.start); } - else { - // See if the record we're about to access is in memory. 
If not, pass a fetch - // request up. - if (auto fetcher = _cursor->fetcherForNext()) { - // Pass the RecordFetcher up. - WorkingSetMember* member = _workingSet->get(_wsidForFetch); - member->setFetcher(fetcher.release()); - *out = _wsidForFetch; - _commonStats.needYield++; - return PlanStage::NEED_YIELD; - } - record = _cursor->next(); - } - } - catch (const WriteConflictException& wce) { - // Leave us in a state to try again next time. - if (needToMakeCursor) - _cursor.reset(); - *out = WorkingSet::INVALID_ID; - return PlanStage::NEED_YIELD; + _commonStats.needTime++; + return PlanStage::NEED_TIME; } - if (!record) { - // We just hit EOF. If we are tailable and have already returned data, leave us in a - // state to pick up where we left off on the next call to work(). Otherwise EOF is - // permanent. - if (_params.tailable && !_lastSeenId.isNull()) { - _cursor.reset(); - } - else { - _commonStats.isEOF = true; + if (_lastSeenId.isNull() && !_params.start.isNull()) { + record = _cursor->seekExact(_params.start); + } else { + // See if the record we're about to access is in memory. If not, pass a fetch + // request up. + if (auto fetcher = _cursor->fetcherForNext()) { + // Pass the RecordFetcher up. + WorkingSetMember* member = _workingSet->get(_wsidForFetch); + member->setFetcher(fetcher.release()); + *out = _wsidForFetch; + _commonStats.needYield++; + return PlanStage::NEED_YIELD; } - - return PlanStage::IS_EOF; - } - - _lastSeenId = record->id; - - WorkingSetID id = _workingSet->allocate(); - WorkingSetMember* member = _workingSet->get(id); - member->loc = record->id; - member->obj = {_txn->recoveryUnit()->getSnapshotId(), record->data.releaseToBson()}; - member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; - return returnIfMatches(member, id, out); + record = _cursor->next(); + } + } catch (const WriteConflictException& wce) { + // Leave us in a state to try again next time. 
+ if (needToMakeCursor) + _cursor.reset(); + *out = WorkingSet::INVALID_ID; + return PlanStage::NEED_YIELD; } - PlanStage::StageState CollectionScan::returnIfMatches(WorkingSetMember* member, - WorkingSetID memberID, - WorkingSetID* out) { - ++_specificStats.docsTested; - - if (Filter::passes(member, _filter)) { - *out = memberID; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - else { - _workingSet->free(memberID); - ++_commonStats.needTime; - return PlanStage::NEED_TIME; + if (!record) { + // We just hit EOF. If we are tailable and have already returned data, leave us in a + // state to pick up where we left off on the next call to work(). Otherwise EOF is + // permanent. + if (_params.tailable && !_lastSeenId.isNull()) { + _cursor.reset(); + } else { + _commonStats.isEOF = true; } + + return PlanStage::IS_EOF; } - bool CollectionScan::isEOF() { - return _commonStats.isEOF || _isDead; + _lastSeenId = record->id; + + WorkingSetID id = _workingSet->allocate(); + WorkingSetMember* member = _workingSet->get(id); + member->loc = record->id; + member->obj = {_txn->recoveryUnit()->getSnapshotId(), record->data.releaseToBson()}; + member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; + + return returnIfMatches(member, id, out); +} + +PlanStage::StageState CollectionScan::returnIfMatches(WorkingSetMember* member, + WorkingSetID memberID, + WorkingSetID* out) { + ++_specificStats.docsTested; + + if (Filter::passes(member, _filter)) { + *out = memberID; + ++_commonStats.advanced; + return PlanStage::ADVANCED; + } else { + _workingSet->free(memberID); + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } +} - void CollectionScan::invalidate(OperationContext* txn, - const RecordId& id, - InvalidationType type) { - ++_commonStats.invalidates; +bool CollectionScan::isEOF() { + return _commonStats.isEOF || _isDead; +} - // We don't care about mutations since we apply any filters to the result when we (possibly) - // return it. 
- if (INVALIDATION_DELETION != type) { - return; - } +void CollectionScan::invalidate(OperationContext* txn, const RecordId& id, InvalidationType type) { + ++_commonStats.invalidates; - // If we're here, 'id' is being deleted. + // We don't care about mutations since we apply any filters to the result when we (possibly) + // return it. + if (INVALIDATION_DELETION != type) { + return; + } - // Deletions can harm the underlying RecordCursor so we must pass them down. - if (_cursor) { - _cursor->invalidate(id); - } + // If we're here, 'id' is being deleted. - if (_params.tailable && id == _lastSeenId) { - // This means that deletes have caught up to the reader. We want to error in this case - // so readers don't miss potentially important data. - _isDead = true; - } + // Deletions can harm the underlying RecordCursor so we must pass them down. + if (_cursor) { + _cursor->invalidate(id); } - void CollectionScan::saveState() { - _txn = NULL; - ++_commonStats.yields; - if (_cursor) { - _cursor->savePositioned(); - } + if (_params.tailable && id == _lastSeenId) { + // This means that deletes have caught up to the reader. We want to error in this case + // so readers don't miss potentially important data. 
+ _isDead = true; } +} - void CollectionScan::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; - if (_cursor) { - if (!_cursor->restore(opCtx)) { - warning() << "Collection dropped or state deleted during yield of CollectionScan: " - << opCtx->getNS(); - _isDead = true; - } +void CollectionScan::saveState() { + _txn = NULL; + ++_commonStats.yields; + if (_cursor) { + _cursor->savePositioned(); + } +} + +void CollectionScan::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; + if (_cursor) { + if (!_cursor->restore(opCtx)) { + warning() << "Collection dropped or state deleted during yield of CollectionScan: " + << opCtx->getNS(); + _isDead = true; } } - - vector<PlanStage*> CollectionScan::getChildren() const { - vector<PlanStage*> empty; - return empty; +} + +vector<PlanStage*> CollectionScan::getChildren() const { + vector<PlanStage*> empty; + return empty; +} + +PlanStageStats* CollectionScan::getStats() { + // Add a BSON representation of the filter to the stats tree, if there is one. + if (NULL != _filter) { + BSONObjBuilder bob; + _filter->toBSON(&bob); + _commonStats.filter = bob.obj(); } - PlanStageStats* CollectionScan::getStats() { - // Add a BSON representation of the filter to the stats tree, if there is one. 
- if (NULL != _filter) { - BSONObjBuilder bob; - _filter->toBSON(&bob); - _commonStats.filter = bob.obj(); - } + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_COLLSCAN)); + ret->specific.reset(new CollectionScanStats(_specificStats)); + return ret.release(); +} - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_COLLSCAN)); - ret->specific.reset(new CollectionScanStats(_specificStats)); - return ret.release(); - } - - const CommonStats* CollectionScan::getCommonStats() const { - return &_commonStats; - } +const CommonStats* CollectionScan::getCommonStats() const { + return &_commonStats; +} - const SpecificStats* CollectionScan::getSpecificStats() const { - return &_specificStats; - } +const SpecificStats* CollectionScan::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/collection_scan.h b/src/mongo/db/exec/collection_scan.h index ec19c5e22ee..ec3ffe63bea 100644 --- a/src/mongo/db/exec/collection_scan.h +++ b/src/mongo/db/exec/collection_scan.h @@ -37,75 +37,75 @@ namespace mongo { - class RecordCursor; - class WorkingSet; - class OperationContext; +class RecordCursor; +class WorkingSet; +class OperationContext; - /** - * Scans over a collection, starting at the RecordId provided in params and continuing until - * there are no more records in the collection. - * - * Preconditions: Valid RecordId. - */ - class CollectionScan : public PlanStage { - public: - CollectionScan(OperationContext* txn, - const CollectionScanParams& params, - WorkingSet* workingSet, - const MatchExpression* filter); +/** + * Scans over a collection, starting at the RecordId provided in params and continuing until + * there are no more records in the collection. + * + * Preconditions: Valid RecordId. 
+ */ +class CollectionScan : public PlanStage { +public: + CollectionScan(OperationContext* txn, + const CollectionScanParams& params, + WorkingSet* workingSet, + const MatchExpression* filter); - virtual StageState work(WorkingSetID* out); - virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_COLLSCAN; } + virtual StageType stageType() const { + return STAGE_COLLSCAN; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - /** - * If the member (with id memberID) passes our filter, set *out to memberID and return that - * ADVANCED. Otherwise, free memberID and return NEED_TIME. - */ - StageState returnIfMatches(WorkingSetMember* member, - WorkingSetID memberID, - WorkingSetID* out); +private: + /** + * If the member (with id memberID) passes our filter, set *out to memberID and return that + * ADVANCED. Otherwise, free memberID and return NEED_TIME. + */ + StageState returnIfMatches(WorkingSetMember* member, WorkingSetID memberID, WorkingSetID* out); - // transactional context for read locks. Not owned by us - OperationContext* _txn; + // transactional context for read locks. 
Not owned by us + OperationContext* _txn; - // WorkingSet is not owned by us. - WorkingSet* _workingSet; + // WorkingSet is not owned by us. + WorkingSet* _workingSet; - // The filter is not owned by us. - const MatchExpression* _filter; + // The filter is not owned by us. + const MatchExpression* _filter; - std::unique_ptr<RecordCursor> _cursor; + std::unique_ptr<RecordCursor> _cursor; - CollectionScanParams _params; + CollectionScanParams _params; - bool _isDead; + bool _isDead; - RecordId _lastSeenId; // Null if nothing has been returned from _cursor yet. + RecordId _lastSeenId; // Null if nothing has been returned from _cursor yet. - // We allocate a working set member with this id on construction of the stage. It gets - // used for all fetch requests, changing the RecordId as appropriate. - const WorkingSetID _wsidForFetch; + // We allocate a working set member with this id on construction of the stage. It gets + // used for all fetch requests, changing the RecordId as appropriate. + const WorkingSetID _wsidForFetch; - // Stats - CommonStats _commonStats; - CollectionScanStats _specificStats; - }; + // Stats + CommonStats _commonStats; + CollectionScanStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/collection_scan_common.h b/src/mongo/db/exec/collection_scan_common.h index 9b327598fa5..f5766e0a0d6 100644 --- a/src/mongo/db/exec/collection_scan_common.h +++ b/src/mongo/db/exec/collection_scan_common.h @@ -32,35 +32,32 @@ namespace mongo { - class Collection; +class Collection; - struct CollectionScanParams { - enum Direction { - FORWARD = 1, - BACKWARD = -1, - }; +struct CollectionScanParams { + enum Direction { + FORWARD = 1, + BACKWARD = -1, + }; - CollectionScanParams() : collection(NULL), - start(RecordId()), - direction(FORWARD), - tailable(false), - maxScan(0) { } + CollectionScanParams() + : collection(NULL), start(RecordId()), direction(FORWARD), tailable(false), maxScan(0) {} - // What collection? 
- // not owned - const Collection* collection; + // What collection? + // not owned + const Collection* collection; - // isNull by default. If you specify any value for this, you're responsible for the RecordId - // not being invalidated before the first call to work(...). - RecordId start; + // isNull by default. If you specify any value for this, you're responsible for the RecordId + // not being invalidated before the first call to work(...). + RecordId start; - Direction direction; + Direction direction; - // Do we want the scan to be 'tailable'? Only meaningful if the collection is capped. - bool tailable; + // Do we want the scan to be 'tailable'? Only meaningful if the collection is capped. + bool tailable; - // If non-zero, how many documents will we look at? - size_t maxScan; - }; + // If non-zero, how many documents will we look at? + size_t maxScan; +}; } // namespace mongo diff --git a/src/mongo/db/exec/count.cpp b/src/mongo/db/exec/count.cpp index 7534f7e1e44..092c36dfc03 100644 --- a/src/mongo/db/exec/count.cpp +++ b/src/mongo/db/exec/count.cpp @@ -36,185 +36,180 @@ namespace mongo { - using std::unique_ptr; - using std::vector; - - // static - const char* CountStage::kStageType = "COUNT"; - - CountStage::CountStage(OperationContext* txn, - Collection* collection, - const CountRequest& request, - WorkingSet* ws, - PlanStage* child) - : _txn(txn), - _collection(collection), - _request(request), - _leftToSkip(request.getSkip()), - _ws(ws), - _child(child), - _commonStats(kStageType) { } - - CountStage::~CountStage() { } - - bool CountStage::isEOF() { - if (_specificStats.trivialCount) { - return true; - } - - if (_request.getLimit() > 0 && _specificStats.nCounted >= _request.getLimit()) { - return true; - } - - return NULL != _child.get() && _child->isEOF(); +using std::unique_ptr; +using std::vector; + +// static +const char* CountStage::kStageType = "COUNT"; + +CountStage::CountStage(OperationContext* txn, + Collection* collection, + const 
CountRequest& request, + WorkingSet* ws, + PlanStage* child) + : _txn(txn), + _collection(collection), + _request(request), + _leftToSkip(request.getSkip()), + _ws(ws), + _child(child), + _commonStats(kStageType) {} + +CountStage::~CountStage() {} + +bool CountStage::isEOF() { + if (_specificStats.trivialCount) { + return true; } - void CountStage::trivialCount() { - invariant(_collection); - long long nCounted = _collection->numRecords(_txn); + if (_request.getLimit() > 0 && _specificStats.nCounted >= _request.getLimit()) { + return true; + } - if (0 != _request.getSkip()) { - nCounted -= _request.getSkip(); - if (nCounted < 0) { - nCounted = 0; - } - } + return NULL != _child.get() && _child->isEOF(); +} - long long limit = _request.getLimit(); - if (limit < 0) { - limit = -limit; - } +void CountStage::trivialCount() { + invariant(_collection); + long long nCounted = _collection->numRecords(_txn); - if (limit < nCounted && 0 != limit) { - nCounted = limit; + if (0 != _request.getSkip()) { + nCounted -= _request.getSkip(); + if (nCounted < 0) { + nCounted = 0; } + } - _specificStats.nCounted = nCounted; - _specificStats.nSkipped = _request.getSkip(); - _specificStats.trivialCount = true; + long long limit = _request.getLimit(); + if (limit < 0) { + limit = -limit; } - PlanStage::StageState CountStage::work(WorkingSetID* out) { - ++_commonStats.works; + if (limit < nCounted && 0 != limit) { + nCounted = limit; + } - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); + _specificStats.nCounted = nCounted; + _specificStats.nSkipped = _request.getSkip(); + _specificStats.trivialCount = true; +} - // This stage never returns a working set member. 
- *out = WorkingSet::INVALID_ID; +PlanStage::StageState CountStage::work(WorkingSetID* out) { + ++_commonStats.works; - // If we don't have a query and we have a non-NULL collection, then we can execute this - // as a trivial count (just ask the collection for how many records it has). - if (_request.getQuery().isEmpty() && NULL != _collection) { - trivialCount(); - return PlanStage::IS_EOF; - } + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - if (isEOF()) { - _commonStats.isEOF = true; - return PlanStage::IS_EOF; - } - - // For non-trivial counts, we should always have a child stage from which we can retrieve - // results. - invariant(_child.get()); - WorkingSetID id = WorkingSet::INVALID_ID; - PlanStage::StageState state = _child->work(&id); - - if (PlanStage::IS_EOF == state) { - _commonStats.isEOF = true; - return PlanStage::IS_EOF; - } - else if (PlanStage::DEAD == state) { - return state; - } - else if (PlanStage::FAILURE == state || PlanStage::DEAD == state) { - *out = id; - // If a stage fails, it may create a status WSM to indicate why it failed, in which - // case 'id' is valid. If ID is invalid, we create our own error message. - if (WorkingSet::INVALID_ID == id) { - const std::string errmsg = "count stage failed to read result from child"; - Status status = Status(ErrorCodes::InternalError, errmsg); - *out = WorkingSetCommon::allocateStatusMember(_ws, status); - } - return state; - } - else if (PlanStage::ADVANCED == state) { - // We got a result. If we're still skipping, then decrement the number left to skip. - // Otherwise increment the count until we hit the limit. - if (_leftToSkip > 0) { - _leftToSkip--; - _specificStats.nSkipped++; - } - else { - _specificStats.nCounted++; - } - - // Count doesn't need the actual results, so we just discard any valid working - // set members that got returned from the child. 
- if (WorkingSet::INVALID_ID != id) { - _ws->free(id); - } - } - else if (PlanStage::NEED_YIELD == state) { - *out = id; - _commonStats.needYield++; - return PlanStage::NEED_YIELD; - } + // This stage never returns a working set member. + *out = WorkingSet::INVALID_ID; - _commonStats.needTime++; - return PlanStage::NEED_TIME; + // If we don't have a query and we have a non-NULL collection, then we can execute this + // as a trivial count (just ask the collection for how many records it has). + if (_request.getQuery().isEmpty() && NULL != _collection) { + trivialCount(); + return PlanStage::IS_EOF; } - void CountStage::saveState() { - _txn = NULL; - ++_commonStats.yields; - if (_child.get()) { - _child->saveState(); - } + if (isEOF()) { + _commonStats.isEOF = true; + return PlanStage::IS_EOF; } - void CountStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; - if (_child.get()) { - _child->restoreState(opCtx); - } + // For non-trivial counts, we should always have a child stage from which we can retrieve + // results. + invariant(_child.get()); + WorkingSetID id = WorkingSet::INVALID_ID; + PlanStage::StageState state = _child->work(&id); + + if (PlanStage::IS_EOF == state) { + _commonStats.isEOF = true; + return PlanStage::IS_EOF; + } else if (PlanStage::DEAD == state) { + return state; + } else if (PlanStage::FAILURE == state || PlanStage::DEAD == state) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it failed, in which + // case 'id' is valid. If ID is invalid, we create our own error message. + if (WorkingSet::INVALID_ID == id) { + const std::string errmsg = "count stage failed to read result from child"; + Status status = Status(ErrorCodes::InternalError, errmsg); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + } + return state; + } else if (PlanStage::ADVANCED == state) { + // We got a result. 
If we're still skipping, then decrement the number left to skip. + // Otherwise increment the count until we hit the limit. + if (_leftToSkip > 0) { + _leftToSkip--; + _specificStats.nSkipped++; + } else { + _specificStats.nCounted++; + } + + // Count doesn't need the actual results, so we just discard any valid working + // set members that got returned from the child. + if (WorkingSet::INVALID_ID != id) { + _ws->free(id); + } + } else if (PlanStage::NEED_YIELD == state) { + *out = id; + _commonStats.needYield++; + return PlanStage::NEED_YIELD; } - void CountStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - if (_child.get()) { - _child->invalidate(txn, dl, type); - } - } + _commonStats.needTime++; + return PlanStage::NEED_TIME; +} - vector<PlanStage*> CountStage::getChildren() const { - vector<PlanStage*> children; - if (_child.get()) { - children.push_back(_child.get()); - } - return children; +void CountStage::saveState() { + _txn = NULL; + ++_commonStats.yields; + if (_child.get()) { + _child->saveState(); } - - PlanStageStats* CountStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_COUNT)); - CountStats* countStats = new CountStats(_specificStats); - ret->specific.reset(countStats); - if (_child.get()) { - ret->children.push_back(_child->getStats()); - } - return ret.release(); +} + +void CountStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; + if (_child.get()) { + _child->restoreState(opCtx); } +} - const CommonStats* CountStage::getCommonStats() const { - return &_commonStats; +void CountStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + if (_child.get()) { + _child->invalidate(txn, dl, type); } +} - const SpecificStats* CountStage::getSpecificStats() const { - return &_specificStats; 
+vector<PlanStage*> CountStage::getChildren() const { + vector<PlanStage*> children; + if (_child.get()) { + children.push_back(_child.get()); } + return children; +} + +PlanStageStats* CountStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_COUNT)); + CountStats* countStats = new CountStats(_specificStats); + ret->specific.reset(countStats); + if (_child.get()) { + ret->children.push_back(_child->getStats()); + } + return ret.release(); +} + +const CommonStats* CountStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* CountStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/count.h b/src/mongo/db/exec/count.h index 05bce99ac66..6f5a5f4e203 100644 --- a/src/mongo/db/exec/count.h +++ b/src/mongo/db/exec/count.h @@ -34,72 +34,74 @@ namespace mongo { - /** - * Stage used by the count command. This stage sits at the root of a plan tree - * and counts the number of results returned by its child stage. - * - * This should not be confused with the CountScan stage. CountScan is a special - * index access stage which can optimize index access for count operations in - * some cases. On the other hand, *every* count op has a CountStage at its root. - * - * Only returns NEED_TIME until hitting EOF. The count result can be obtained by examining - * the specific stats. - */ - class CountStage : public PlanStage { - public: - CountStage(OperationContext* txn, - Collection* collection, - const CountRequest& request, - WorkingSet* ws, - PlanStage* child); +/** + * Stage used by the count command. This stage sits at the root of a plan tree + * and counts the number of results returned by its child stage. + * + * This should not be confused with the CountScan stage. CountScan is a special + * index access stage which can optimize index access for count operations in + * some cases. 
On the other hand, *every* count op has a CountStage at its root. + * + * Only returns NEED_TIME until hitting EOF. The count result can be obtained by examining + * the specific stats. + */ +class CountStage : public PlanStage { +public: + CountStage(OperationContext* txn, + Collection* collection, + const CountRequest& request, + WorkingSet* ws, + PlanStage* child); - virtual ~CountStage(); + virtual ~CountStage(); - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_COUNT; } + virtual StageType stageType() const { + return STAGE_COUNT; + } - PlanStageStats* getStats(); + PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - /** - * Computes the count in the case of an empty query, applying the skip and - * limit if necessary. The result is stored in '_specificStats'. - */ - void trivialCount(); +private: + /** + * Computes the count in the case of an empty query, applying the skip and + * limit if necessary. The result is stored in '_specificStats'. + */ + void trivialCount(); - // Transactional context for read locks. Not owned by us. - OperationContext* _txn; + // Transactional context for read locks. 
Not owned by us. + OperationContext* _txn; - // The collection over which we are counting. - Collection* _collection; + // The collection over which we are counting. + Collection* _collection; - CountRequest _request; + CountRequest _request; - // The number of documents that we still need to skip. - long long _leftToSkip; + // The number of documents that we still need to skip. + long long _leftToSkip; - // The working set used to pass intermediate results between stages. Not owned - // by us. - WorkingSet* _ws; + // The working set used to pass intermediate results between stages. Not owned + // by us. + WorkingSet* _ws; - std::unique_ptr<PlanStage> _child; + std::unique_ptr<PlanStage> _child; - CommonStats _commonStats; - CountStats _specificStats; - }; + CommonStats _commonStats; + CountStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/count_scan.cpp b/src/mongo/db/exec/count_scan.cpp index c002e72e9ca..23499102147 100644 --- a/src/mongo/db/exec/count_scan.cpp +++ b/src/mongo/db/exec/count_scan.cpp @@ -34,149 +34,148 @@ namespace mongo { - using std::unique_ptr; - using std::vector; - - // static - const char* CountScan::kStageType = "COUNT_SCAN"; - - CountScan::CountScan(OperationContext* txn, - const CountScanParams& params, - WorkingSet* workingSet) - : _txn(txn), - _workingSet(workingSet), - _descriptor(params.descriptor), - _iam(params.descriptor->getIndexCatalog()->getIndex(params.descriptor)), - _shouldDedup(params.descriptor->isMultikey(txn)), - _params(params), - _commonStats(kStageType) { - _specificStats.keyPattern = _params.descriptor->keyPattern(); - _specificStats.indexName = _params.descriptor->indexName(); - _specificStats.isMultiKey = _params.descriptor->isMultikey(txn); - _specificStats.isUnique = _params.descriptor->unique(); - _specificStats.isSparse = _params.descriptor->isSparse(); - _specificStats.isPartial = _params.descriptor->isPartial(); - _specificStats.indexVersion = _params.descriptor->version(); - - // 
endKey must be after startKey in index order since we only do forward scans. - dassert(_params.startKey.woCompare(_params.endKey, - Ordering::make(params.descriptor->keyPattern()), - /*compareFieldNames*/false) <= 0); - } - - - PlanStage::StageState CountScan::work(WorkingSetID* out) { - ++_commonStats.works; - if (_commonStats.isEOF) return PlanStage::IS_EOF; - - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); - - boost::optional<IndexKeyEntry> entry; - const bool needInit = !_cursor; - try { - // We don't care about the keys. - const auto kWantLoc = SortedDataInterface::Cursor::kWantLoc; - - if (needInit) { - // First call to work(). Perform cursor init. - _cursor = _iam->newCursor(_txn); - _cursor->setEndPosition(_params.endKey, _params.endKeyInclusive); - - entry = _cursor->seek(_params.startKey, _params.startKeyInclusive, kWantLoc); - } - else { - entry = _cursor->next(kWantLoc); - } - } - catch (const WriteConflictException& wce) { - if (needInit) { - // Release our cursor and try again next time. 
- _cursor.reset(); - } - *out = WorkingSet::INVALID_ID; - return PlanStage::NEED_YIELD; +using std::unique_ptr; +using std::vector; + +// static +const char* CountScan::kStageType = "COUNT_SCAN"; + +CountScan::CountScan(OperationContext* txn, const CountScanParams& params, WorkingSet* workingSet) + : _txn(txn), + _workingSet(workingSet), + _descriptor(params.descriptor), + _iam(params.descriptor->getIndexCatalog()->getIndex(params.descriptor)), + _shouldDedup(params.descriptor->isMultikey(txn)), + _params(params), + _commonStats(kStageType) { + _specificStats.keyPattern = _params.descriptor->keyPattern(); + _specificStats.indexName = _params.descriptor->indexName(); + _specificStats.isMultiKey = _params.descriptor->isMultikey(txn); + _specificStats.isUnique = _params.descriptor->unique(); + _specificStats.isSparse = _params.descriptor->isSparse(); + _specificStats.isPartial = _params.descriptor->isPartial(); + _specificStats.indexVersion = _params.descriptor->version(); + + // endKey must be after startKey in index order since we only do forward scans. + dassert(_params.startKey.woCompare(_params.endKey, + Ordering::make(params.descriptor->keyPattern()), + /*compareFieldNames*/ false) <= 0); +} + + +PlanStage::StageState CountScan::work(WorkingSetID* out) { + ++_commonStats.works; + if (_commonStats.isEOF) + return PlanStage::IS_EOF; + + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); + + boost::optional<IndexKeyEntry> entry; + const bool needInit = !_cursor; + try { + // We don't care about the keys. + const auto kWantLoc = SortedDataInterface::Cursor::kWantLoc; + + if (needInit) { + // First call to work(). Perform cursor init. 
+ _cursor = _iam->newCursor(_txn); + _cursor->setEndPosition(_params.endKey, _params.endKeyInclusive); + + entry = _cursor->seek(_params.startKey, _params.startKeyInclusive, kWantLoc); + } else { + entry = _cursor->next(kWantLoc); } - - ++_specificStats.keysExamined; - - if (!entry) { - _commonStats.isEOF = true; + } catch (const WriteConflictException& wce) { + if (needInit) { + // Release our cursor and try again next time. _cursor.reset(); - return PlanStage::IS_EOF; - } - - if (_shouldDedup && !_returned.insert(entry->loc).second) { - // *loc was already in _returned. - ++_commonStats.needTime; - return PlanStage::NEED_TIME; } - *out = WorkingSet::INVALID_ID; - ++_commonStats.advanced; - return PlanStage::ADVANCED; + return PlanStage::NEED_YIELD; } - bool CountScan::isEOF() { - return _commonStats.isEOF; - } + ++_specificStats.keysExamined; - void CountScan::saveState() { - _txn = NULL; - ++_commonStats.yields; - if (_cursor) _cursor->savePositioned(); + if (!entry) { + _commonStats.isEOF = true; + _cursor.reset(); + return PlanStage::IS_EOF; } - void CountScan::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; - - if (_cursor) _cursor->restore(opCtx); - - // This can change during yielding. - // TODO this isn't sufficient. See SERVER-17678. - _shouldDedup = _descriptor->isMultikey(_txn); + if (_shouldDedup && !_returned.insert(entry->loc).second) { + // *loc was already in _returned. + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } - void CountScan::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - - // The only state we're responsible for holding is what RecordIds to drop. If a document - // mutates the underlying index cursor will deal with it. - if (INVALIDATION_MUTATION == type) { - return; - } - - // If we see this RecordId again, it may not be the same document it was before, so we want - // to return it if we see it again. 
- unordered_set<RecordId, RecordId::Hasher>::iterator it = _returned.find(dl); - if (it != _returned.end()) { - _returned.erase(it); - } + *out = WorkingSet::INVALID_ID; + ++_commonStats.advanced; + return PlanStage::ADVANCED; +} + +bool CountScan::isEOF() { + return _commonStats.isEOF; +} + +void CountScan::saveState() { + _txn = NULL; + ++_commonStats.yields; + if (_cursor) + _cursor->savePositioned(); +} + +void CountScan::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; + + if (_cursor) + _cursor->restore(opCtx); + + // This can change during yielding. + // TODO this isn't sufficient. See SERVER-17678. + _shouldDedup = _descriptor->isMultikey(_txn); +} + +void CountScan::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + + // The only state we're responsible for holding is what RecordIds to drop. If a document + // mutates the underlying index cursor will deal with it. + if (INVALIDATION_MUTATION == type) { + return; } - vector<PlanStage*> CountScan::getChildren() const { - vector<PlanStage*> empty; - return empty; + // If we see this RecordId again, it may not be the same document it was before, so we want + // to return it if we see it again. 
+ unordered_set<RecordId, RecordId::Hasher>::iterator it = _returned.find(dl); + if (it != _returned.end()) { + _returned.erase(it); } +} - PlanStageStats* CountScan::getStats() { - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_COUNT_SCAN)); +vector<PlanStage*> CountScan::getChildren() const { + vector<PlanStage*> empty; + return empty; +} - CountScanStats* countStats = new CountScanStats(_specificStats); - countStats->keyPattern = _specificStats.keyPattern.getOwned(); - ret->specific.reset(countStats); +PlanStageStats* CountScan::getStats() { + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_COUNT_SCAN)); - return ret.release(); - } + CountScanStats* countStats = new CountScanStats(_specificStats); + countStats->keyPattern = _specificStats.keyPattern.getOwned(); + ret->specific.reset(countStats); - const CommonStats* CountScan::getCommonStats() const { - return &_commonStats; - } + return ret.release(); +} - const SpecificStats* CountScan::getSpecificStats() const { - return &_specificStats; - } +const CommonStats* CountScan::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* CountScan::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/count_scan.h b/src/mongo/db/exec/count_scan.h index e63f672637d..e00fa05e2f8 100644 --- a/src/mongo/db/exec/count_scan.h +++ b/src/mongo/db/exec/count_scan.h @@ -39,75 +39,77 @@ namespace mongo { - class IndexAccessMethod; - class IndexDescriptor; - class WorkingSet; +class IndexAccessMethod; +class IndexDescriptor; +class WorkingSet; - struct CountScanParams { - CountScanParams() : descriptor(NULL) { } +struct CountScanParams { + CountScanParams() : descriptor(NULL) {} - // What index are we traversing? - const IndexDescriptor* descriptor; + // What index are we traversing? 
+ const IndexDescriptor* descriptor; - BSONObj startKey; - bool startKeyInclusive; + BSONObj startKey; + bool startKeyInclusive; - BSONObj endKey; - bool endKeyInclusive; - }; + BSONObj endKey; + bool endKeyInclusive; +}; - /** - * Used by the count command. Scans an index from a start key to an end key. Does not create - * any WorkingSetMember(s) for any of the data, instead returning ADVANCED to indicate to the - * caller that another result should be counted. - * - * Only created through the getExecutorCount path, as count is the only operation that doesn't - * care about its data. - */ - class CountScan : public PlanStage { - public: - CountScan(OperationContext* txn, const CountScanParams& params, WorkingSet* workingSet); - virtual ~CountScan() { } +/** + * Used by the count command. Scans an index from a start key to an end key. Does not create + * any WorkingSetMember(s) for any of the data, instead returning ADVANCED to indicate to the + * caller that another result should be counted. + * + * Only created through the getExecutorCount path, as count is the only operation that doesn't + * care about its data. 
+ */ +class CountScan : public PlanStage { +public: + CountScan(OperationContext* txn, const CountScanParams& params, WorkingSet* workingSet); + virtual ~CountScan() {} - virtual StageState work(WorkingSetID* out); - virtual bool isEOF(); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_COUNT_SCAN; } + virtual StageType stageType() const { + return STAGE_COUNT_SCAN; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - // transactional context for read locks. Not owned by us - OperationContext* _txn; +private: + // transactional context for read locks. Not owned by us + OperationContext* _txn; - // The WorkingSet we annotate with results. Not owned by us. - WorkingSet* _workingSet; + // The WorkingSet we annotate with results. Not owned by us. + WorkingSet* _workingSet; - // Index access. Both pointers below are owned by Collection -> IndexCatalog. - const IndexDescriptor* _descriptor; - const IndexAccessMethod* _iam; + // Index access. Both pointers below are owned by Collection -> IndexCatalog. 
+ const IndexDescriptor* _descriptor; + const IndexAccessMethod* _iam; - std::unique_ptr<SortedDataInterface::Cursor> _cursor; + std::unique_ptr<SortedDataInterface::Cursor> _cursor; - // Could our index have duplicates? If so, we use _returned to dedup. - bool _shouldDedup; - unordered_set<RecordId, RecordId::Hasher> _returned; + // Could our index have duplicates? If so, we use _returned to dedup. + bool _shouldDedup; + unordered_set<RecordId, RecordId::Hasher> _returned; - CountScanParams _params; + CountScanParams _params; - CommonStats _commonStats; - CountScanStats _specificStats; - }; + CommonStats _commonStats; + CountScanStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/delete.cpp b/src/mongo/db/exec/delete.cpp index 787f3f244bf..5831b44e86a 100644 --- a/src/mongo/db/exec/delete.cpp +++ b/src/mongo/db/exec/delete.cpp @@ -45,274 +45,271 @@ namespace mongo { - using std::unique_ptr; - using std::vector; - - // static - const char* DeleteStage::kStageType = "DELETE"; - - DeleteStage::DeleteStage(OperationContext* txn, - const DeleteStageParams& params, - WorkingSet* ws, - Collection* collection, - PlanStage* child) - : _txn(txn), - _params(params), - _ws(ws), - _collection(collection), - _child(child), - _idRetrying(WorkingSet::INVALID_ID), - _idReturning(WorkingSet::INVALID_ID), - _commonStats(kStageType) { } - - DeleteStage::~DeleteStage() {} - - bool DeleteStage::isEOF() { - if (!_collection) { - return true; - } - if (!_params.isMulti && _specificStats.docsDeleted > 0) { - return true; - } - return _idRetrying == WorkingSet::INVALID_ID - && _idReturning == WorkingSet::INVALID_ID - && _child->isEOF(); +using std::unique_ptr; +using std::vector; + +// static +const char* DeleteStage::kStageType = "DELETE"; + +DeleteStage::DeleteStage(OperationContext* txn, + const DeleteStageParams& params, + WorkingSet* ws, + Collection* collection, + PlanStage* child) + : _txn(txn), + _params(params), + _ws(ws), + _collection(collection), + 
_child(child), + _idRetrying(WorkingSet::INVALID_ID), + _idReturning(WorkingSet::INVALID_ID), + _commonStats(kStageType) {} + +DeleteStage::~DeleteStage() {} + +bool DeleteStage::isEOF() { + if (!_collection) { + return true; } + if (!_params.isMulti && _specificStats.docsDeleted > 0) { + return true; + } + return _idRetrying == WorkingSet::INVALID_ID && _idReturning == WorkingSet::INVALID_ID && + _child->isEOF(); +} - PlanStage::StageState DeleteStage::work(WorkingSetID* out) { - ++_commonStats.works; - - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); +PlanStage::StageState DeleteStage::work(WorkingSetID* out) { + ++_commonStats.works; - if (isEOF()) { return PlanStage::IS_EOF; } - invariant(_collection); // If isEOF() returns false, we must have a collection. + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - // It is possible that after a delete was executed, a WriteConflictException occurred - // and prevented us from returning ADVANCED with the old version of the document. - if (_idReturning != WorkingSet::INVALID_ID) { - // We should only get here if we were trying to return something before. - invariant(_params.returnDeleted); + if (isEOF()) { + return PlanStage::IS_EOF; + } + invariant(_collection); // If isEOF() returns false, we must have a collection. - WorkingSetMember* member = _ws->get(_idReturning); - invariant(member->state == WorkingSetMember::OWNED_OBJ); + // It is possible that after a delete was executed, a WriteConflictException occurred + // and prevented us from returning ADVANCED with the old version of the document. + if (_idReturning != WorkingSet::INVALID_ID) { + // We should only get here if we were trying to return something before. 
+ invariant(_params.returnDeleted); - *out = _idReturning; - _idReturning = WorkingSet::INVALID_ID; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } + WorkingSetMember* member = _ws->get(_idReturning); + invariant(member->state == WorkingSetMember::OWNED_OBJ); - // Either retry the last WSM we worked on or get a new one from our child. - WorkingSetID id; - StageState status; - if (_idRetrying == WorkingSet::INVALID_ID) { - status = _child->work(&id); - } - else { - status = ADVANCED; - id = _idRetrying; - _idRetrying = WorkingSet::INVALID_ID; - } + *out = _idReturning; + _idReturning = WorkingSet::INVALID_ID; + ++_commonStats.advanced; + return PlanStage::ADVANCED; + } - if (PlanStage::ADVANCED == status) { - WorkingSetMember* member = _ws->get(id); + // Either retry the last WSM we worked on or get a new one from our child. + WorkingSetID id; + StageState status; + if (_idRetrying == WorkingSet::INVALID_ID) { + status = _child->work(&id); + } else { + status = ADVANCED; + id = _idRetrying; + _idRetrying = WorkingSet::INVALID_ID; + } - // We want to free this member when we return, unless we need to retry it. - ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id); + if (PlanStage::ADVANCED == status) { + WorkingSetMember* member = _ws->get(id); - if (!member->hasLoc()) { - // We expect to be here because of an invalidation causing a force-fetch, and - // doc-locking storage engines do not issue invalidations. - ++_specificStats.nInvalidateSkips; - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - RecordId rloc = member->loc; - // Deletes can't have projections. This means that covering analysis will always add - // a fetch. We should always get fetched data, and never just key data. - invariant(member->hasObj()); + // We want to free this member when we return, unless we need to retry it. 
+ ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id); - try { - // If the snapshot changed, then we have to make sure we have the latest copy of the - // doc and that it still matches. - std::unique_ptr<RecordCursor> cursor; - if (_txn->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) { - cursor = _collection->getCursor(_txn); - if (!WorkingSetCommon::fetch(_txn, member, cursor)) { - // Doc is already deleted. Nothing more to do. - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - - // Make sure the re-fetched doc still matches the predicate. - if (_params.canonicalQuery && - !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) { - // Doesn't match. - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } + if (!member->hasLoc()) { + // We expect to be here because of an invalidation causing a force-fetch, and + // doc-locking storage engines do not issue invalidations. + ++_specificStats.nInvalidateSkips; + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } + RecordId rloc = member->loc; + // Deletes can't have projections. This means that covering analysis will always add + // a fetch. We should always get fetched data, and never just key data. + invariant(member->hasObj()); + + try { + // If the snapshot changed, then we have to make sure we have the latest copy of the + // doc and that it still matches. + std::unique_ptr<RecordCursor> cursor; + if (_txn->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) { + cursor = _collection->getCursor(_txn); + if (!WorkingSetCommon::fetch(_txn, member, cursor)) { + // Doc is already deleted. Nothing more to do. + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } - // TODO: Do we want to buffer docs and delete them in a group rather than - // saving/restoring state repeatedly? - - try { - _child->saveState(); - if (supportsDocLocking()) { - // Doc-locking engines require this after saveState() since they don't use - // invalidations. 
- WorkingSetCommon::prepareForSnapshotChange(_ws); - } - } - catch ( const WriteConflictException& wce ) { - std::terminate(); + // Make sure the re-fetched doc still matches the predicate. + if (_params.canonicalQuery && + !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) { + // Doesn't match. + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } + } - if (_params.returnDeleted) { - // Save a copy of the document that is about to get deleted. - BSONObj deletedDoc = member->obj.value(); - member->obj.setValue(deletedDoc.getOwned()); - member->loc = RecordId(); - member->state = WorkingSetMember::OWNED_OBJ; + // TODO: Do we want to buffer docs and delete them in a group rather than + // saving/restoring state repeatedly? + + try { + _child->saveState(); + if (supportsDocLocking()) { + // Doc-locking engines require this after saveState() since they don't use + // invalidations. + WorkingSetCommon::prepareForSnapshotChange(_ws); } + } catch (const WriteConflictException& wce) { + std::terminate(); + } - // Do the write, unless this is an explain. - if (!_params.isExplain) { - WriteUnitOfWork wunit(_txn); + if (_params.returnDeleted) { + // Save a copy of the document that is about to get deleted. + BSONObj deletedDoc = member->obj.value(); + member->obj.setValue(deletedDoc.getOwned()); + member->loc = RecordId(); + member->state = WorkingSetMember::OWNED_OBJ; + } - const bool deleteCappedOK = false; - const bool deleteNoWarn = false; - BSONObj deletedId; + // Do the write, unless this is an explain. + if (!_params.isExplain) { + WriteUnitOfWork wunit(_txn); - _collection->deleteDocument(_txn, rloc, deleteCappedOK, deleteNoWarn, - _params.shouldCallLogOp ? &deletedId : NULL); + const bool deleteCappedOK = false; + const bool deleteNoWarn = false; + BSONObj deletedId; - wunit.commit(); - } + _collection->deleteDocument(_txn, + rloc, + deleteCappedOK, + deleteNoWarn, + _params.shouldCallLogOp ? 
&deletedId : NULL); - ++_specificStats.docsDeleted; - } - catch ( const WriteConflictException& wce ) { - _idRetrying = id; - memberFreer.Dismiss(); // Keep this member around so we can retry deleting it. - *out = WorkingSet::INVALID_ID; - _commonStats.needYield++; - return NEED_YIELD; + wunit.commit(); } - // As restoreState may restore (recreate) cursors, cursors are tied to the - // transaction in which they are created, and a WriteUnitOfWork is a - // transaction, make sure to restore the state outside of the WritUnitOfWork. - try { - _child->restoreState(_txn); - } - catch ( const WriteConflictException& wce ) { - // Note we don't need to retry anything in this case since the delete already - // was committed. However, we still need to return the deleted document - // (if it was requested). - if (_params.returnDeleted) { - // member->obj should refer to the deleted document. - invariant(member->state == WorkingSetMember::OWNED_OBJ); - - _idReturning = id; - // Keep this member around so that we can return it on the next work() call. - memberFreer.Dismiss(); - } - *out = WorkingSet::INVALID_ID; - _commonStats.needYield++; - return NEED_YIELD; - } + ++_specificStats.docsDeleted; + } catch (const WriteConflictException& wce) { + _idRetrying = id; + memberFreer.Dismiss(); // Keep this member around so we can retry deleting it. + *out = WorkingSet::INVALID_ID; + _commonStats.needYield++; + return NEED_YIELD; + } + // As restoreState may restore (recreate) cursors, cursors are tied to the + // transaction in which they are created, and a WriteUnitOfWork is a + // transaction, make sure to restore the state outside of the WritUnitOfWork. + try { + _child->restoreState(_txn); + } catch (const WriteConflictException& wce) { + // Note we don't need to retry anything in this case since the delete already + // was committed. However, we still need to return the deleted document + // (if it was requested). 
if (_params.returnDeleted) { // member->obj should refer to the deleted document. invariant(member->state == WorkingSetMember::OWNED_OBJ); - memberFreer.Dismiss(); // Keep this member around so we can return it. - *out = id; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { - *out = id; - // If a stage fails, it may create a status WSM to indicate why it failed, in which case - // 'id' is valid. If ID is invalid, we create our own error message. - if (WorkingSet::INVALID_ID == id) { - const std::string errmsg = "delete stage failed to read in results from child"; - *out = WorkingSetCommon::allocateStatusMember(_ws, Status(ErrorCodes::InternalError, - errmsg)); + _idReturning = id; + // Keep this member around so that we can return it on the next work() call. + memberFreer.Dismiss(); } - return status; - } - else if (PlanStage::NEED_TIME == status) { - ++_commonStats.needTime; + *out = WorkingSet::INVALID_ID; + _commonStats.needYield++; + return NEED_YIELD; } - else if (PlanStage::NEED_YIELD == status) { + + if (_params.returnDeleted) { + // member->obj should refer to the deleted document. + invariant(member->state == WorkingSetMember::OWNED_OBJ); + + memberFreer.Dismiss(); // Keep this member around so we can return it. *out = id; - ++_commonStats.needYield; + ++_commonStats.advanced; + return PlanStage::ADVANCED; } + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it failed, in which case + // 'id' is valid. If ID is invalid, we create our own error message. 
+ if (WorkingSet::INVALID_ID == id) { + const std::string errmsg = "delete stage failed to read in results from child"; + *out = WorkingSetCommon::allocateStatusMember( + _ws, Status(ErrorCodes::InternalError, errmsg)); + } return status; + } else if (PlanStage::NEED_TIME == status) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == status) { + *out = id; + ++_commonStats.needYield; } - void DeleteStage::saveState() { - _txn = NULL; - ++_commonStats.yields; - _child->saveState(); - } - - void DeleteStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; - _child->restoreState(opCtx); - - const NamespaceString& ns(_collection->ns()); - massert(28537, - str::stream() << "Demoted from primary while removing from " << ns.ns(), - !_params.shouldCallLogOp || + return status; +} + +void DeleteStage::saveState() { + _txn = NULL; + ++_commonStats.yields; + _child->saveState(); +} + +void DeleteStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; + _child->restoreState(opCtx); + + const NamespaceString& ns(_collection->ns()); + massert(28537, + str::stream() << "Demoted from primary while removing from " << ns.ns(), + !_params.shouldCallLogOp || repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)); - } - - void DeleteStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - _child->invalidate(txn, dl, type); - } - - vector<PlanStage*> DeleteStage::getChildren() const { - vector<PlanStage*> children; - children.push_back(_child.get()); - return children; - } - - PlanStageStats* DeleteStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_DELETE)); - ret->specific.reset(new DeleteStats(_specificStats)); - ret->children.push_back(_child->getStats()); - return ret.release(); - } - - const CommonStats* 
DeleteStage::getCommonStats() const { - return &_commonStats; - } - - const SpecificStats* DeleteStage::getSpecificStats() const { - return &_specificStats; - } - - // static - long long DeleteStage::getNumDeleted(PlanExecutor* exec) { - invariant(exec->getRootStage()->isEOF()); - invariant(exec->getRootStage()->stageType() == STAGE_DELETE); - DeleteStage* deleteStage = static_cast<DeleteStage*>(exec->getRootStage()); - const DeleteStats* deleteStats = - static_cast<const DeleteStats*>(deleteStage->getSpecificStats()); - return deleteStats->docsDeleted; - } +} + +void DeleteStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + _child->invalidate(txn, dl, type); +} + +vector<PlanStage*> DeleteStage::getChildren() const { + vector<PlanStage*> children; + children.push_back(_child.get()); + return children; +} + +PlanStageStats* DeleteStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_DELETE)); + ret->specific.reset(new DeleteStats(_specificStats)); + ret->children.push_back(_child->getStats()); + return ret.release(); +} + +const CommonStats* DeleteStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* DeleteStage::getSpecificStats() const { + return &_specificStats; +} + +// static +long long DeleteStage::getNumDeleted(PlanExecutor* exec) { + invariant(exec->getRootStage()->isEOF()); + invariant(exec->getRootStage()->stageType() == STAGE_DELETE); + DeleteStage* deleteStage = static_cast<DeleteStage*>(exec->getRootStage()); + const DeleteStats* deleteStats = + static_cast<const DeleteStats*>(deleteStage->getSpecificStats()); + return deleteStats->docsDeleted; +} } // namespace mongo diff --git a/src/mongo/db/exec/delete.h b/src/mongo/db/exec/delete.h index 028cd086279..75556152680 100644 --- a/src/mongo/db/exec/delete.h +++ b/src/mongo/db/exec/delete.h @@ -34,109 +34,112 @@ namespace mongo { - class 
CanonicalQuery; - class OperationContext; - class PlanExecutor; +class CanonicalQuery; +class OperationContext; +class PlanExecutor; - struct DeleteStageParams { - DeleteStageParams() : - isMulti(false), - shouldCallLogOp(false), - fromMigrate(false), - isExplain(false), - returnDeleted(false), - canonicalQuery(NULL) { } +struct DeleteStageParams { + DeleteStageParams() + : isMulti(false), + shouldCallLogOp(false), + fromMigrate(false), + isExplain(false), + returnDeleted(false), + canonicalQuery(NULL) {} - // Should we delete all documents returned from the child (a "multi delete"), or at most one - // (a "single delete")? - bool isMulti; + // Should we delete all documents returned from the child (a "multi delete"), or at most one + // (a "single delete")? + bool isMulti; - // Should we write each delete to the oplog? - bool shouldCallLogOp; + // Should we write each delete to the oplog? + bool shouldCallLogOp; - // Is this delete part of a migrate operation that is essentially like a no-op - // when the cluster is observed by an external client. - bool fromMigrate; + // Is this delete part of a migrate operation that is essentially like a no-op + // when the cluster is observed by an external client. + bool fromMigrate; - // Are we explaining a delete command rather than actually executing it? - bool isExplain; + // Are we explaining a delete command rather than actually executing it? + bool isExplain; - // Should we return the document we just deleted? - bool returnDeleted; + // Should we return the document we just deleted? + bool returnDeleted; - // The parsed query predicate for this delete. Not owned here. - CanonicalQuery* canonicalQuery; - }; + // The parsed query predicate for this delete. Not owned here. + CanonicalQuery* canonicalQuery; +}; - /** - * This stage delete documents by RecordId that are returned from its child. If the deleted - * document was requested to be returned, then ADVANCED is returned after deleting a document. 
- * Otherwise, NEED_TIME is returned after deleting a document. - * - * Callers of work() must be holding a write lock (and, for shouldCallLogOp=true deletes, - * callers must have had the replication coordinator approve the write). - */ - class DeleteStage : public PlanStage { - MONGO_DISALLOW_COPYING(DeleteStage); - public: - DeleteStage(OperationContext* txn, - const DeleteStageParams& params, - WorkingSet* ws, - Collection* collection, - PlanStage* child); - virtual ~DeleteStage(); +/** + * This stage delete documents by RecordId that are returned from its child. If the deleted + * document was requested to be returned, then ADVANCED is returned after deleting a document. + * Otherwise, NEED_TIME is returned after deleting a document. + * + * Callers of work() must be holding a write lock (and, for shouldCallLogOp=true deletes, + * callers must have had the replication coordinator approve the write). + */ +class DeleteStage : public PlanStage { + MONGO_DISALLOW_COPYING(DeleteStage); + +public: + DeleteStage(OperationContext* txn, + const DeleteStageParams& params, + WorkingSet* ws, + Collection* collection, + PlanStage* child); + virtual ~DeleteStage(); - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_DELETE; } + virtual StageType stageType() const { + return STAGE_DELETE; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const 
CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - /** - * Extracts the number of documents deleted by the update plan 'exec'. - * - * Should only be called if the root plan stage of 'exec' is UPDATE and if 'exec' is EOF. - */ - static long long getNumDeleted(PlanExecutor* exec); + /** + * Extracts the number of documents deleted by the update plan 'exec'. + * + * Should only be called if the root plan stage of 'exec' is UPDATE and if 'exec' is EOF. + */ + static long long getNumDeleted(PlanExecutor* exec); - private: - // Transactional context. Not owned by us. - OperationContext* _txn; +private: + // Transactional context. Not owned by us. + OperationContext* _txn; - DeleteStageParams _params; + DeleteStageParams _params; - // Not owned by us. - WorkingSet* _ws; + // Not owned by us. + WorkingSet* _ws; - // Collection to operate on. Not owned by us. Can be NULL (if NULL, isEOF() will always - // return true). If non-NULL, the lifetime of the collection must supersede that of the - // stage. - Collection* _collection; + // Collection to operate on. Not owned by us. Can be NULL (if NULL, isEOF() will always + // return true). If non-NULL, the lifetime of the collection must supersede that of the + // stage. + Collection* _collection; - std::unique_ptr<PlanStage> _child; + std::unique_ptr<PlanStage> _child; - // If not WorkingSet::INVALID_ID, we use this rather than asking our child what to do next. - WorkingSetID _idRetrying; + // If not WorkingSet::INVALID_ID, we use this rather than asking our child what to do next. + WorkingSetID _idRetrying; - // If not WorkingSet::INVALID_ID, we return this member to our caller. - WorkingSetID _idReturning; + // If not WorkingSet::INVALID_ID, we return this member to our caller. 
+ WorkingSetID _idReturning; - // Stats - CommonStats _commonStats; - DeleteStats _specificStats; - }; + // Stats + CommonStats _commonStats; + DeleteStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/distinct_scan.cpp b/src/mongo/db/exec/distinct_scan.cpp index 24dcc36019a..7cbe5db389e 100644 --- a/src/mongo/db/exec/distinct_scan.cpp +++ b/src/mongo/db/exec/distinct_scan.cpp @@ -37,60 +37,62 @@ namespace mongo { - using std::unique_ptr; - using std::vector; - - // static - const char* DistinctScan::kStageType = "DISTINCT_SCAN"; - - DistinctScan::DistinctScan(OperationContext* txn, const DistinctParams& params, WorkingSet* workingSet) - : _txn(txn), - _workingSet(workingSet), - _descriptor(params.descriptor), - _iam(params.descriptor->getIndexCatalog()->getIndex(params.descriptor)), - _params(params), - _checker(&_params.bounds, _descriptor->keyPattern(), _params.direction), - _commonStats(kStageType) { - - _specificStats.keyPattern = _params.descriptor->keyPattern(); - _specificStats.indexName = _params.descriptor->indexName(); - _specificStats.indexVersion = _params.descriptor->version(); - - // Set up our initial seek. If there is no valid data, just mark as EOF. - _commonStats.isEOF = !_checker.getStartSeekPoint(&_seekPoint); +using std::unique_ptr; +using std::vector; + +// static +const char* DistinctScan::kStageType = "DISTINCT_SCAN"; + +DistinctScan::DistinctScan(OperationContext* txn, + const DistinctParams& params, + WorkingSet* workingSet) + : _txn(txn), + _workingSet(workingSet), + _descriptor(params.descriptor), + _iam(params.descriptor->getIndexCatalog()->getIndex(params.descriptor)), + _params(params), + _checker(&_params.bounds, _descriptor->keyPattern(), _params.direction), + _commonStats(kStageType) { + _specificStats.keyPattern = _params.descriptor->keyPattern(); + _specificStats.indexName = _params.descriptor->indexName(); + _specificStats.indexVersion = _params.descriptor->version(); + + // Set up our initial seek. 
If there is no valid data, just mark as EOF. + _commonStats.isEOF = !_checker.getStartSeekPoint(&_seekPoint); +} + +PlanStage::StageState DistinctScan::work(WorkingSetID* out) { + ++_commonStats.works; + if (_commonStats.isEOF) + return PlanStage::IS_EOF; + + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); + + boost::optional<IndexKeyEntry> kv; + try { + if (!_cursor) + _cursor = _iam->newCursor(_txn, _params.direction == 1); + kv = _cursor->seek(_seekPoint); + } catch (const WriteConflictException& wce) { + *out = WorkingSet::INVALID_ID; + return PlanStage::NEED_YIELD; } - PlanStage::StageState DistinctScan::work(WorkingSetID* out) { - ++_commonStats.works; - if (_commonStats.isEOF) return PlanStage::IS_EOF; - - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); - - boost::optional<IndexKeyEntry> kv; - try { - if (!_cursor) _cursor = _iam->newCursor(_txn, _params.direction == 1); - kv = _cursor->seek(_seekPoint); - } - catch (const WriteConflictException& wce) { - *out = WorkingSet::INVALID_ID; - return PlanStage::NEED_YIELD; - } - - if (!kv) { - _commonStats.isEOF = true; - return PlanStage::IS_EOF; - } + if (!kv) { + _commonStats.isEOF = true; + return PlanStage::IS_EOF; + } - ++_specificStats.keysExamined; + ++_specificStats.keysExamined; - switch (_checker.checkKey(kv->key, &_seekPoint)) { - case IndexBoundsChecker::MUST_ADVANCE: + switch (_checker.checkKey(kv->key, &_seekPoint)) { + case IndexBoundsChecker::MUST_ADVANCE: // Try again next time. The checker has adjusted the _seekPoint. ++_commonStats.needTime; return PlanStage::NEED_TIME; - case IndexBoundsChecker::DONE: + case IndexBoundsChecker::DONE: // There won't be a next time. _commonStats.isEOF = true; _cursor.reset(); @@ -99,8 +101,9 @@ namespace mongo { case IndexBoundsChecker::VALID: // Return this key. 
Adjust the _seekPoint so that it is exclusive on the field we // are using. - - if (!kv->key.isOwned()) kv->key = kv->key.getOwned(); + + if (!kv->key.isOwned()) + kv->key = kv->key.getOwned(); _seekPoint.keyPrefix = kv->key; _seekPoint.prefixLen = _params.fieldNo + 1; _seekPoint.prefixExclusive = true; @@ -115,51 +118,53 @@ namespace mongo { *out = id; ++_commonStats.advanced; return PlanStage::ADVANCED; - } - invariant(false); - } - - bool DistinctScan::isEOF() { - return _commonStats.isEOF; - } - - void DistinctScan::saveState() { - _txn = NULL; - ++_commonStats.yields; - - // We always seek, so we don't care where the cursor is. - if (_cursor) _cursor->saveUnpositioned(); - } - - void DistinctScan::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; - - if (_cursor) _cursor->restore(opCtx); - } - - void DistinctScan::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - } - - vector<PlanStage*> DistinctScan::getChildren() const { - vector<PlanStage*> empty; - return empty; - } - - PlanStageStats* DistinctScan::getStats() { - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_DISTINCT_SCAN)); - ret->specific.reset(new DistinctScanStats(_specificStats)); - return ret.release(); - } - - const CommonStats* DistinctScan::getCommonStats() const { - return &_commonStats; - } - - const SpecificStats* DistinctScan::getSpecificStats() const { - return &_specificStats; } + invariant(false); +} + +bool DistinctScan::isEOF() { + return _commonStats.isEOF; +} + +void DistinctScan::saveState() { + _txn = NULL; + ++_commonStats.yields; + + // We always seek, so we don't care where the cursor is. 
+ if (_cursor) + _cursor->saveUnpositioned(); +} + +void DistinctScan::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; + + if (_cursor) + _cursor->restore(opCtx); +} + +void DistinctScan::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; +} + +vector<PlanStage*> DistinctScan::getChildren() const { + vector<PlanStage*> empty; + return empty; +} + +PlanStageStats* DistinctScan::getStats() { + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_DISTINCT_SCAN)); + ret->specific.reset(new DistinctScanStats(_specificStats)); + return ret.release(); +} + +const CommonStats* DistinctScan::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* DistinctScan::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/distinct_scan.h b/src/mongo/db/exec/distinct_scan.h index d87f5249535..10f850d5a2a 100644 --- a/src/mongo/db/exec/distinct_scan.h +++ b/src/mongo/db/exec/distinct_scan.h @@ -39,87 +39,87 @@ namespace mongo { - class IndexAccessMethod; - class IndexDescriptor; - class WorkingSet; +class IndexAccessMethod; +class IndexDescriptor; +class WorkingSet; - struct DistinctParams { - DistinctParams() : descriptor(NULL), - direction(1), - fieldNo(0) { } +struct DistinctParams { + DistinctParams() : descriptor(NULL), direction(1), fieldNo(0) {} - // What index are we traversing? - const IndexDescriptor* descriptor; + // What index are we traversing? + const IndexDescriptor* descriptor; - // And in what direction? - int direction; + // And in what direction? + int direction; - // What are the bounds? - IndexBounds bounds; + // What are the bounds? + IndexBounds bounds; - // What field in the index's key pattern is the one we're distinct-ing over? - // For example: - // If we have an index {a:1, b:1} we could use it to distinct over either 'a' or 'b'. 
- // If we distinct over 'a' the position is 0. - // If we distinct over 'b' the position is 1. - int fieldNo; - }; + // What field in the index's key pattern is the one we're distinct-ing over? + // For example: + // If we have an index {a:1, b:1} we could use it to distinct over either 'a' or 'b'. + // If we distinct over 'a' the position is 0. + // If we distinct over 'b' the position is 1. + int fieldNo; +}; - /** - * Used by the distinct command. Executes a mutated index scan over the provided bounds. - * However, rather than looking at every key in the bounds, it skips to the next value of the - * _params.fieldNo-th indexed field. This is because distinct only cares about distinct values - * for that field, so there is no point in examining all keys with the same value for that - * field. - * - * Only created through the getExecutorDistinct path. See db/query/get_executor.cpp - */ - class DistinctScan : public PlanStage { - public: - DistinctScan(OperationContext* txn, const DistinctParams& params, WorkingSet* workingSet); - virtual ~DistinctScan() { } +/** + * Used by the distinct command. Executes a mutated index scan over the provided bounds. + * However, rather than looking at every key in the bounds, it skips to the next value of the + * _params.fieldNo-th indexed field. This is because distinct only cares about distinct values + * for that field, so there is no point in examining all keys with the same value for that + * field. + * + * Only created through the getExecutorDistinct path. 
See db/query/get_executor.cpp + */ +class DistinctScan : public PlanStage { +public: + DistinctScan(OperationContext* txn, const DistinctParams& params, WorkingSet* workingSet); + virtual ~DistinctScan() {} - virtual StageState work(WorkingSetID* out); - virtual bool isEOF(); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_DISTINCT_SCAN; } + virtual StageType stageType() const { + return STAGE_DISTINCT_SCAN; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - // transactional context for read locks. Not owned by us - OperationContext* _txn; +private: + // transactional context for read locks. Not owned by us + OperationContext* _txn; - // The WorkingSet we annotate with results. Not owned by us. - WorkingSet* _workingSet; + // The WorkingSet we annotate with results. Not owned by us. + WorkingSet* _workingSet; - // Index access. - const IndexDescriptor* _descriptor; // owned by Collection -> IndexCatalog - const IndexAccessMethod* _iam; // owned by Collection -> IndexCatalog + // Index access. 
+ const IndexDescriptor* _descriptor; // owned by Collection -> IndexCatalog + const IndexAccessMethod* _iam; // owned by Collection -> IndexCatalog - // The cursor we use to navigate the tree. - std::unique_ptr<SortedDataInterface::Cursor> _cursor; + // The cursor we use to navigate the tree. + std::unique_ptr<SortedDataInterface::Cursor> _cursor; - DistinctParams _params; + DistinctParams _params; - // _checker gives us our start key and ensures we stay in bounds. - IndexBoundsChecker _checker; - IndexSeekPoint _seekPoint; + // _checker gives us our start key and ensures we stay in bounds. + IndexBoundsChecker _checker; + IndexSeekPoint _seekPoint; - // Stats - CommonStats _commonStats; - DistinctScanStats _specificStats; - }; + // Stats + CommonStats _commonStats; + DistinctScanStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/eof.cpp b/src/mongo/db/exec/eof.cpp index 1766c9b7a3e..4318e6ad9f6 100644 --- a/src/mongo/db/exec/eof.cpp +++ b/src/mongo/db/exec/eof.cpp @@ -34,54 +34,54 @@ namespace mongo { - using std::vector; +using std::vector; - // static - const char* EOFStage::kStageType = "EOF"; +// static +const char* EOFStage::kStageType = "EOF"; - EOFStage::EOFStage() : _commonStats(kStageType) { } +EOFStage::EOFStage() : _commonStats(kStageType) {} - EOFStage::~EOFStage() { } +EOFStage::~EOFStage() {} - bool EOFStage::isEOF() { - return true; - } +bool EOFStage::isEOF() { + return true; +} - PlanStage::StageState EOFStage::work(WorkingSetID* out) { - ++_commonStats.works; - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); - return PlanStage::IS_EOF; - } +PlanStage::StageState EOFStage::work(WorkingSetID* out) { + ++_commonStats.works; + // Adds the amount of time taken by work() to executionTimeMillis. 
+ ScopedTimer timer(&_commonStats.executionTimeMillis); + return PlanStage::IS_EOF; +} - void EOFStage::saveState() { - ++_commonStats.yields; - } +void EOFStage::saveState() { + ++_commonStats.yields; +} - void EOFStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; - } +void EOFStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; +} - void EOFStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - } +void EOFStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; +} - vector<PlanStage*> EOFStage::getChildren() const { - vector<PlanStage*> empty; - return empty; - } +vector<PlanStage*> EOFStage::getChildren() const { + vector<PlanStage*> empty; + return empty; +} - PlanStageStats* EOFStage::getStats() { - _commonStats.isEOF = isEOF(); - return new PlanStageStats(_commonStats, STAGE_EOF); - } +PlanStageStats* EOFStage::getStats() { + _commonStats.isEOF = isEOF(); + return new PlanStageStats(_commonStats, STAGE_EOF); +} - const CommonStats* EOFStage::getCommonStats() const { - return &_commonStats; - } +const CommonStats* EOFStage::getCommonStats() const { + return &_commonStats; +} - const SpecificStats* EOFStage::getSpecificStats() const { - return nullptr; - } +const SpecificStats* EOFStage::getSpecificStats() const { + return nullptr; +} } // namespace mongo diff --git a/src/mongo/db/exec/eof.h b/src/mongo/db/exec/eof.h index 247453d1d03..c81b6fdefc3 100644 --- a/src/mongo/db/exec/eof.h +++ b/src/mongo/db/exec/eof.h @@ -33,36 +33,38 @@ namespace mongo { - /** - * This stage just returns EOF immediately. - */ - class EOFStage : public PlanStage { - public: - EOFStage(); +/** + * This stage just returns EOF immediately. 
+ */ +class EOFStage : public PlanStage { +public: + EOFStage(); - virtual ~EOFStage(); + virtual ~EOFStage(); - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_EOF; } + virtual StageType stageType() const { + return STAGE_EOF; + } - PlanStageStats* getStats(); + PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - CommonStats _commonStats; - }; +private: + CommonStats _commonStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/fetch.cpp b/src/mongo/db/exec/fetch.cpp index 817bc72fc8d..cab7655f2f0 100644 --- a/src/mongo/db/exec/fetch.cpp +++ b/src/mongo/db/exec/fetch.cpp @@ -41,216 +41,214 @@ namespace mongo { - using std::unique_ptr; - using std::vector; - - // static - const char* FetchStage::kStageType = "FETCH"; - - FetchStage::FetchStage(OperationContext* txn, - WorkingSet* ws, - PlanStage* child, - const MatchExpression* filter, - const Collection* collection) - : _txn(txn), - _collection(collection), - _ws(ws), - _child(child), - _filter(filter), - _idRetrying(WorkingSet::INVALID_ID), - _commonStats(kStageType) { } - - FetchStage::~FetchStage() { } - - bool FetchStage::isEOF() { - if 
(WorkingSet::INVALID_ID != _idRetrying) { - // We asked the parent for a page-in, but still haven't had a chance to return the - // paged in document - return false; - } - - return _child->isEOF(); +using std::unique_ptr; +using std::vector; + +// static +const char* FetchStage::kStageType = "FETCH"; + +FetchStage::FetchStage(OperationContext* txn, + WorkingSet* ws, + PlanStage* child, + const MatchExpression* filter, + const Collection* collection) + : _txn(txn), + _collection(collection), + _ws(ws), + _child(child), + _filter(filter), + _idRetrying(WorkingSet::INVALID_ID), + _commonStats(kStageType) {} + +FetchStage::~FetchStage() {} + +bool FetchStage::isEOF() { + if (WorkingSet::INVALID_ID != _idRetrying) { + // We asked the parent for a page-in, but still haven't had a chance to return the + // paged in document + return false; } - PlanStage::StageState FetchStage::work(WorkingSetID* out) { - ++_commonStats.works; + return _child->isEOF(); +} - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); +PlanStage::StageState FetchStage::work(WorkingSetID* out) { + ++_commonStats.works; - if (isEOF()) { return PlanStage::IS_EOF; } + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - // Either retry the last WSM we worked on or get a new one from our child. - WorkingSetID id; - StageState status; - if (_idRetrying == WorkingSet::INVALID_ID) { - status = _child->work(&id); - } - else { - status = ADVANCED; - id = _idRetrying; - _idRetrying = WorkingSet::INVALID_ID; - } + if (isEOF()) { + return PlanStage::IS_EOF; + } - if (PlanStage::ADVANCED == status) { - WorkingSetMember* member = _ws->get(id); + // Either retry the last WSM we worked on or get a new one from our child. 
+ WorkingSetID id; + StageState status; + if (_idRetrying == WorkingSet::INVALID_ID) { + status = _child->work(&id); + } else { + status = ADVANCED; + id = _idRetrying; + _idRetrying = WorkingSet::INVALID_ID; + } - // If there's an obj there, there is no fetching to perform. - if (member->hasObj()) { - ++_specificStats.alreadyHasObj; - } - else { - // We need a valid loc to fetch from and this is the only state that has one. - verify(WorkingSetMember::LOC_AND_IDX == member->state); - verify(member->hasLoc()); - - try { - if (!_cursor) _cursor = _collection->getCursor(_txn); - - if (auto fetcher = _cursor->fetcherForId(member->loc)) { - // There's something to fetch. Hand the fetcher off to the WSM, and pass up - // a fetch request. - _idRetrying = id; - member->setFetcher(fetcher.release()); - *out = id; - _commonStats.needYield++; - return NEED_YIELD; - } - - // The doc is already in memory, so go ahead and grab it. Now we have a RecordId - // as well as an unowned object - if (!WorkingSetCommon::fetch(_txn, member, _cursor)) { - _ws->free(id); - _commonStats.needTime++; - return NEED_TIME; - } - } - catch (const WriteConflictException& wce) { + if (PlanStage::ADVANCED == status) { + WorkingSetMember* member = _ws->get(id); + + // If there's an obj there, there is no fetching to perform. + if (member->hasObj()) { + ++_specificStats.alreadyHasObj; + } else { + // We need a valid loc to fetch from and this is the only state that has one. + verify(WorkingSetMember::LOC_AND_IDX == member->state); + verify(member->hasLoc()); + + try { + if (!_cursor) + _cursor = _collection->getCursor(_txn); + + if (auto fetcher = _cursor->fetcherForId(member->loc)) { + // There's something to fetch. Hand the fetcher off to the WSM, and pass up + // a fetch request. 
_idRetrying = id; - *out = WorkingSet::INVALID_ID; + member->setFetcher(fetcher.release()); + *out = id; _commonStats.needYield++; return NEED_YIELD; } - } - return returnIfMatches(member, id, out); - } - else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { - *out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. - if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "fetch stage failed to read in results from child"; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); + // The doc is already in memory, so go ahead and grab it. Now we have a RecordId + // as well as an unowned object + if (!WorkingSetCommon::fetch(_txn, member, _cursor)) { + _ws->free(id); + _commonStats.needTime++; + return NEED_TIME; + } + } catch (const WriteConflictException& wce) { + _idRetrying = id; + *out = WorkingSet::INVALID_ID; + _commonStats.needYield++; + return NEED_YIELD; } - return status; - } - else if (PlanStage::NEED_TIME == status) { - ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == status) { - ++_commonStats.needYield; - *out = id; } + return returnIfMatches(member, id, out); + } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. 
+ if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "fetch stage failed to read in results from child"; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + } return status; + } else if (PlanStage::NEED_TIME == status) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == status) { + ++_commonStats.needYield; + *out = id; } - void FetchStage::saveState() { - _txn = NULL; - ++_commonStats.yields; - if (_cursor) _cursor->saveUnpositioned(); - _child->saveState(); - } - - void FetchStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; - if (_cursor) _cursor->restore(opCtx); - _child->restoreState(opCtx); - } - - void FetchStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - - _child->invalidate(txn, dl, type); - - // It's possible that the loc getting invalidated is the one we're about to - // fetch. In this case we do a "forced fetch" and put the WSM in owned object state. - if (WorkingSet::INVALID_ID != _idRetrying) { - WorkingSetMember* member = _ws->get(_idRetrying); - if (member->hasLoc() && (member->loc == dl)) { - // Fetch it now and kill the diskloc. 
- WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); - } + return status; +} + +void FetchStage::saveState() { + _txn = NULL; + ++_commonStats.yields; + if (_cursor) + _cursor->saveUnpositioned(); + _child->saveState(); +} + +void FetchStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; + if (_cursor) + _cursor->restore(opCtx); + _child->restoreState(opCtx); +} + +void FetchStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + + _child->invalidate(txn, dl, type); + + // It's possible that the loc getting invalidated is the one we're about to + // fetch. In this case we do a "forced fetch" and put the WSM in owned object state. + if (WorkingSet::INVALID_ID != _idRetrying) { + WorkingSetMember* member = _ws->get(_idRetrying); + if (member->hasLoc() && (member->loc == dl)) { + // Fetch it now and kill the diskloc. + WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); } } - - PlanStage::StageState FetchStage::returnIfMatches(WorkingSetMember* member, - WorkingSetID memberID, - WorkingSetID* out) { - // We consider "examining a document" to be every time that we pass a document through - // a filter by calling Filter::passes(...) below. Therefore, the 'docsExamined' metric - // is not always equal to the number of documents that were fetched from the collection. - // In particular, we can sometimes generate plans which have two fetch stages. The first - // one actually grabs the document from the collection, and the second passes the - // document through a second filter. - // - // One common example of this is geoNear. Suppose that a geoNear plan is searching an - // annulus to find 2dsphere-indexed documents near some point (x, y) on the globe. 
- // After fetching documents within geo hashes that intersect this annulus, the docs are - // fetched and filtered to make sure that they really do fall into this annulus. However, - // the user might also want to find only those documents for which accommodationType== - // "restaurant". The planner will add a second fetch stage to filter by this non-geo - // predicate. - ++_specificStats.docsExamined; - - if (Filter::passes(member, _filter)) { - *out = memberID; - - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - else { - _ws->free(memberID); - - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } +} + +PlanStage::StageState FetchStage::returnIfMatches(WorkingSetMember* member, + WorkingSetID memberID, + WorkingSetID* out) { + // We consider "examining a document" to be every time that we pass a document through + // a filter by calling Filter::passes(...) below. Therefore, the 'docsExamined' metric + // is not always equal to the number of documents that were fetched from the collection. + // In particular, we can sometimes generate plans which have two fetch stages. The first + // one actually grabs the document from the collection, and the second passes the + // document through a second filter. + // + // One common example of this is geoNear. Suppose that a geoNear plan is searching an + // annulus to find 2dsphere-indexed documents near some point (x, y) on the globe. + // After fetching documents within geo hashes that intersect this annulus, the docs are + // fetched and filtered to make sure that they really do fall into this annulus. However, + // the user might also want to find only those documents for which accommodationType== + // "restaurant". The planner will add a second fetch stage to filter by this non-geo + // predicate. 
+ ++_specificStats.docsExamined; + + if (Filter::passes(member, _filter)) { + *out = memberID; + + ++_commonStats.advanced; + return PlanStage::ADVANCED; + } else { + _ws->free(memberID); + + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } - - vector<PlanStage*> FetchStage::getChildren() const { - vector<PlanStage*> children; - children.push_back(_child.get()); - return children; +} + +vector<PlanStage*> FetchStage::getChildren() const { + vector<PlanStage*> children; + children.push_back(_child.get()); + return children; +} + +PlanStageStats* FetchStage::getStats() { + _commonStats.isEOF = isEOF(); + + // Add a BSON representation of the filter to the stats tree, if there is one. + if (NULL != _filter) { + BSONObjBuilder bob; + _filter->toBSON(&bob); + _commonStats.filter = bob.obj(); } - PlanStageStats* FetchStage::getStats() { - _commonStats.isEOF = isEOF(); - - // Add a BSON representation of the filter to the stats tree, if there is one. - if (NULL != _filter) { - BSONObjBuilder bob; - _filter->toBSON(&bob); - _commonStats.filter = bob.obj(); - } - - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_FETCH)); - ret->specific.reset(new FetchStats(_specificStats)); - ret->children.push_back(_child->getStats()); - return ret.release(); - } + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_FETCH)); + ret->specific.reset(new FetchStats(_specificStats)); + ret->children.push_back(_child->getStats()); + return ret.release(); +} - const CommonStats* FetchStage::getCommonStats() const { - return &_commonStats; - } +const CommonStats* FetchStage::getCommonStats() const { + return &_commonStats; +} - const SpecificStats* FetchStage::getSpecificStats() const { - return &_specificStats; - } +const SpecificStats* FetchStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/fetch.h b/src/mongo/db/exec/fetch.h index b43a38bb7eb..5fba058b730 100644 --- 
a/src/mongo/db/exec/fetch.h +++ b/src/mongo/db/exec/fetch.h @@ -37,75 +37,75 @@ namespace mongo { - class RecordCursor; +class RecordCursor; - /** - * This stage turns a RecordId into a BSONObj. - * - * In WorkingSetMember terms, it transitions from LOC_AND_IDX to LOC_AND_UNOWNED_OBJ by reading - * the record at the provided loc. Returns verbatim any data that already has an object. - * - * Preconditions: Valid RecordId. - */ - class FetchStage : public PlanStage { - public: - FetchStage(OperationContext* txn, - WorkingSet* ws, - PlanStage* child, - const MatchExpression* filter, - const Collection* collection); - - virtual ~FetchStage(); +/** + * This stage turns a RecordId into a BSONObj. + * + * In WorkingSetMember terms, it transitions from LOC_AND_IDX to LOC_AND_UNOWNED_OBJ by reading + * the record at the provided loc. Returns verbatim any data that already has an object. + * + * Preconditions: Valid RecordId. + */ +class FetchStage : public PlanStage { +public: + FetchStage(OperationContext* txn, + WorkingSet* ws, + PlanStage* child, + const MatchExpression* filter, + const Collection* collection); - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + virtual ~FetchStage(); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - virtual std::vector<PlanStage*> getChildren() const; + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual StageType stageType() const { return STAGE_FETCH; } + virtual std::vector<PlanStage*> getChildren() const; - PlanStageStats* getStats(); + virtual StageType stageType() const { + return STAGE_FETCH; + } - virtual const CommonStats* getCommonStats() const; + PlanStageStats* getStats(); - virtual 
const SpecificStats* getSpecificStats() const; + virtual const CommonStats* getCommonStats() const; - static const char* kStageType; + virtual const SpecificStats* getSpecificStats() const; - private: + static const char* kStageType; - /** - * If the member (with id memberID) passes our filter, set *out to memberID and return that - * ADVANCED. Otherwise, free memberID and return NEED_TIME. - */ - StageState returnIfMatches(WorkingSetMember* member, WorkingSetID memberID, - WorkingSetID* out); +private: + /** + * If the member (with id memberID) passes our filter, set *out to memberID and return that + * ADVANCED. Otherwise, free memberID and return NEED_TIME. + */ + StageState returnIfMatches(WorkingSetMember* member, WorkingSetID memberID, WorkingSetID* out); - OperationContext* _txn; + OperationContext* _txn; - // Collection which is used by this stage. Used to resolve record ids retrieved by child - // stages. The lifetime of the collection must supersede that of the stage. - const Collection* _collection; - // Used to fetch Records from _collection. - std::unique_ptr<RecordCursor> _cursor; + // Collection which is used by this stage. Used to resolve record ids retrieved by child + // stages. The lifetime of the collection must supersede that of the stage. + const Collection* _collection; + // Used to fetch Records from _collection. + std::unique_ptr<RecordCursor> _cursor; - // _ws is not owned by us. - WorkingSet* _ws; - std::unique_ptr<PlanStage> _child; + // _ws is not owned by us. + WorkingSet* _ws; + std::unique_ptr<PlanStage> _child; - // The filter is not owned by us. - const MatchExpression* _filter; + // The filter is not owned by us. + const MatchExpression* _filter; - // If not Null, we use this rather than asking our child what to do next. - WorkingSetID _idRetrying; + // If not Null, we use this rather than asking our child what to do next. 
+ WorkingSetID _idRetrying; - // Stats - CommonStats _commonStats; - FetchStats _specificStats; - }; + // Stats + CommonStats _commonStats; + FetchStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/filter.h b/src/mongo/db/exec/filter.h index 802c68993e5..e23e116dfd9 100644 --- a/src/mongo/db/exec/filter.h +++ b/src/mongo/db/exec/filter.h @@ -34,82 +34,37 @@ namespace mongo { - /** - * The MatchExpression uses the MatchableDocument interface to see if a document satisfies the - * expression. This wraps a WorkingSetMember in the MatchableDocument interface so that any of - * the WorkingSetMember's various types can be tested to see if they satisfy an expression. - */ - class WorkingSetMatchableDocument : public MatchableDocument { - public: - WorkingSetMatchableDocument(WorkingSetMember* wsm) : _wsm(wsm) { } - virtual ~WorkingSetMatchableDocument() { } - - // This is only called by a $where query. The query system must be smart enough to realize - // that it should do a fetch beforehand. - BSONObj toBSON() const { - invariant(_wsm->hasObj()); - return _wsm->obj.value(); - } - - virtual ElementIterator* allocateIterator(const ElementPath* path) const { - // BSONElementIterator does some interesting things with arrays that I don't think - // SimpleArrayElementIterator does. - if (_wsm->hasObj()) { - return new BSONElementIterator(path, _wsm->obj.value()); - } - - // NOTE: This (kind of) duplicates code in WorkingSetMember::getFieldDotted. - // Keep in sync w/that. - // Find the first field in the index key data described by path and return an iterator - // over it. 
- for (size_t i = 0; i < _wsm->keyData.size(); ++i) { - BSONObjIterator keyPatternIt(_wsm->keyData[i].indexKeyPattern); - BSONObjIterator keyDataIt(_wsm->keyData[i].keyData); - - while (keyPatternIt.more()) { - BSONElement keyPatternElt = keyPatternIt.next(); - invariant(keyDataIt.more()); - BSONElement keyDataElt = keyDataIt.next(); - - if (path->fieldRef().equalsDottedField(keyPatternElt.fieldName())) { - if (Array == keyDataElt.type()) { - return new SimpleArrayElementIterator(keyDataElt, true); - } - else { - return new SingleElementElementIterator(keyDataElt); - } - } - } - } - - // This should not happen. - massert(16920, "trying to match on unknown field: " + path->fieldRef().dottedField().toString(), - 0); - - return new SingleElementElementIterator(BSONElement()); - } - - virtual void releaseIterator( ElementIterator* iterator ) const { - delete iterator; - } - - private: - WorkingSetMember* _wsm; - }; - - class IndexKeyMatchableDocument : public MatchableDocument { - public: - IndexKeyMatchableDocument(const BSONObj& key, - const BSONObj& keyPattern) - : _keyPattern(keyPattern), _key(key) { } - - BSONObj toBSON() const { - return _key; +/** + * The MatchExpression uses the MatchableDocument interface to see if a document satisfies the + * expression. This wraps a WorkingSetMember in the MatchableDocument interface so that any of + * the WorkingSetMember's various types can be tested to see if they satisfy an expression. + */ +class WorkingSetMatchableDocument : public MatchableDocument { +public: + WorkingSetMatchableDocument(WorkingSetMember* wsm) : _wsm(wsm) {} + virtual ~WorkingSetMatchableDocument() {} + + // This is only called by a $where query. The query system must be smart enough to realize + // that it should do a fetch beforehand. 
+ BSONObj toBSON() const { + invariant(_wsm->hasObj()); + return _wsm->obj.value(); + } + + virtual ElementIterator* allocateIterator(const ElementPath* path) const { + // BSONElementIterator does some interesting things with arrays that I don't think + // SimpleArrayElementIterator does. + if (_wsm->hasObj()) { + return new BSONElementIterator(path, _wsm->obj.value()); } - virtual ElementIterator* allocateIterator(const ElementPath* path) const { - BSONObjIterator keyPatternIt(_keyPattern); - BSONObjIterator keyDataIt(_key); + // NOTE: This (kind of) duplicates code in WorkingSetMember::getFieldDotted. + // Keep in sync w/that. + // Find the first field in the index key data described by path and return an iterator + // over it. + for (size_t i = 0; i < _wsm->keyData.size(); ++i) { + BSONObjIterator keyPatternIt(_wsm->keyData[i].indexKeyPattern); + BSONObjIterator keyDataIt(_wsm->keyData[i].keyData); while (keyPatternIt.more()) { BSONElement keyPatternElt = keyPatternIt.next(); @@ -119,53 +74,99 @@ namespace mongo { if (path->fieldRef().equalsDottedField(keyPatternElt.fieldName())) { if (Array == keyDataElt.type()) { return new SimpleArrayElementIterator(keyDataElt, true); - } - else { + } else { return new SingleElementElementIterator(keyDataElt); } } } - - // Planning should not let this happen. - massert(17409, - "trying to match on unknown field: " + path->fieldRef().dottedField().toString(), - 0); - - return new SingleElementElementIterator(BSONElement()); } - virtual void releaseIterator(ElementIterator* iterator) const { - delete iterator; + // This should not happen. 
+ massert(16920, + "trying to match on unknown field: " + path->fieldRef().dottedField().toString(), + 0); + + return new SingleElementElementIterator(BSONElement()); + } + + virtual void releaseIterator(ElementIterator* iterator) const { + delete iterator; + } + +private: + WorkingSetMember* _wsm; +}; + +class IndexKeyMatchableDocument : public MatchableDocument { +public: + IndexKeyMatchableDocument(const BSONObj& key, const BSONObj& keyPattern) + : _keyPattern(keyPattern), _key(key) {} + + BSONObj toBSON() const { + return _key; + } + + virtual ElementIterator* allocateIterator(const ElementPath* path) const { + BSONObjIterator keyPatternIt(_keyPattern); + BSONObjIterator keyDataIt(_key); + + while (keyPatternIt.more()) { + BSONElement keyPatternElt = keyPatternIt.next(); + invariant(keyDataIt.more()); + BSONElement keyDataElt = keyDataIt.next(); + + if (path->fieldRef().equalsDottedField(keyPatternElt.fieldName())) { + if (Array == keyDataElt.type()) { + return new SimpleArrayElementIterator(keyDataElt, true); + } else { + return new SingleElementElementIterator(keyDataElt); + } + } } - private: - BSONObj _keyPattern; - BSONObj _key; - }; + // Planning should not let this happen. + massert(17409, + "trying to match on unknown field: " + path->fieldRef().dottedField().toString(), + 0); + return new SingleElementElementIterator(BSONElement()); + } + + virtual void releaseIterator(ElementIterator* iterator) const { + delete iterator; + } + +private: + BSONObj _keyPattern; + BSONObj _key; +}; + +/** + * Used by every stage with a filter. + */ +class Filter { +public: /** - * Used by every stage with a filter. + * Returns true if filter is NULL or if 'wsm' satisfies the filter. + * Returns false if 'wsm' does not satisfy the filter. */ - class Filter { - public: - /** - * Returns true if filter is NULL or if 'wsm' satisfies the filter. - * Returns false if 'wsm' does not satisfy the filter. 
- */ - static bool passes(WorkingSetMember* wsm, const MatchExpression* filter) { - if (NULL == filter) { return true; } - WorkingSetMatchableDocument doc(wsm); - return filter->matches(&doc, NULL); + static bool passes(WorkingSetMember* wsm, const MatchExpression* filter) { + if (NULL == filter) { + return true; } - - static bool passes(const BSONObj& keyData, - const BSONObj& keyPattern, - const MatchExpression* filter) { - - if (NULL == filter) { return true; } - IndexKeyMatchableDocument doc(keyData, keyPattern); - return filter->matches(&doc, NULL); + WorkingSetMatchableDocument doc(wsm); + return filter->matches(&doc, NULL); + } + + static bool passes(const BSONObj& keyData, + const BSONObj& keyPattern, + const MatchExpression* filter) { + if (NULL == filter) { + return true; } - }; + IndexKeyMatchableDocument doc(keyData, keyPattern); + return filter->matches(&doc, NULL); + } +}; } // namespace mongo diff --git a/src/mongo/db/exec/geo_near.cpp b/src/mongo/db/exec/geo_near.cpp index 1776fd95a26..b07113f21fd 100644 --- a/src/mongo/db/exec/geo_near.cpp +++ b/src/mongo/db/exec/geo_near.cpp @@ -51,1247 +51,1159 @@ namespace mongo { - using std::abs; - using std::unique_ptr; +using std::abs; +using std::unique_ptr; - // - // Shared GeoNear search functionality - // - - static const double kCircOfEarthInMeters = 2 * M_PI * kRadiusOfEarthInMeters; - static const double kMaxEarthDistanceInMeters = kCircOfEarthInMeters / 2; - static const double kMetersPerDegreeAtEquator = kCircOfEarthInMeters / 360; - - namespace { - - /** - * Structure that holds BSON addresses (BSONElements) and the corresponding geometry parsed - * at those locations. - * Used to separate the parsing of geometries from a BSONObj (which must stay in scope) from - * the computation over those geometries. - * TODO: Merge with 2D/2DSphere key extraction? 
- */ - struct StoredGeometry { - - static StoredGeometry* parseFrom(const BSONElement& element) { - if (!element.isABSONObj()) - return NULL; - - unique_ptr<StoredGeometry> stored(new StoredGeometry); - if (!stored->geometry.parseFromStorage(element).isOK()) - return NULL; - stored->element = element; - return stored.release(); - } +// +// Shared GeoNear search functionality +// - BSONElement element; - GeometryContainer geometry; - }; - } - - /** - * Find and parse all geometry elements on the appropriate field path from the document. - */ - static void extractGeometries(const BSONObj& doc, - const string& path, - vector<StoredGeometry*>* geometries) { +static const double kCircOfEarthInMeters = 2 * M_PI * kRadiusOfEarthInMeters; +static const double kMaxEarthDistanceInMeters = kCircOfEarthInMeters / 2; +static const double kMetersPerDegreeAtEquator = kCircOfEarthInMeters / 360; - BSONElementSet geomElements; - // NOTE: Annoyingly, we cannot just expand arrays b/c single 2d points are arrays, we need - // to manually expand all results to check if they are geometries - doc.getFieldsDotted(path, geomElements, false /* expand arrays */); +namespace { - for (BSONElementSet::iterator it = geomElements.begin(); it != geomElements.end(); ++it) { +/** + * Structure that holds BSON addresses (BSONElements) and the corresponding geometry parsed + * at those locations. + * Used to separate the parsing of geometries from a BSONObj (which must stay in scope) from + * the computation over those geometries. + * TODO: Merge with 2D/2DSphere key extraction? 
+ */ +struct StoredGeometry { + static StoredGeometry* parseFrom(const BSONElement& element) { + if (!element.isABSONObj()) + return NULL; + + unique_ptr<StoredGeometry> stored(new StoredGeometry); + if (!stored->geometry.parseFromStorage(element).isOK()) + return NULL; + stored->element = element; + return stored.release(); + } - const BSONElement& el = *it; - unique_ptr<StoredGeometry> stored(StoredGeometry::parseFrom(el)); + BSONElement element; + GeometryContainer geometry; +}; +} - if (stored.get()) { - // Valid geometry element - geometries->push_back(stored.release()); - } - else if (el.type() == Array) { - - // Many geometries may be in an array - BSONObjIterator arrIt(el.Obj()); - while (arrIt.more()) { - - const BSONElement nextEl = arrIt.next(); - stored.reset(StoredGeometry::parseFrom(nextEl)); - - if (stored.get()) { - // Valid geometry element - geometries->push_back(stored.release()); - } - else { - warning() << "geoNear stage read non-geometry element " << nextEl.toString() - << " in array " << el.toString(); - } +/** + * Find and parse all geometry elements on the appropriate field path from the document. 
+ */ +static void extractGeometries(const BSONObj& doc, + const string& path, + vector<StoredGeometry*>* geometries) { + BSONElementSet geomElements; + // NOTE: Annoyingly, we cannot just expand arrays b/c single 2d points are arrays, we need + // to manually expand all results to check if they are geometries + doc.getFieldsDotted(path, geomElements, false /* expand arrays */); + + for (BSONElementSet::iterator it = geomElements.begin(); it != geomElements.end(); ++it) { + const BSONElement& el = *it; + unique_ptr<StoredGeometry> stored(StoredGeometry::parseFrom(el)); + + if (stored.get()) { + // Valid geometry element + geometries->push_back(stored.release()); + } else if (el.type() == Array) { + // Many geometries may be in an array + BSONObjIterator arrIt(el.Obj()); + while (arrIt.more()) { + const BSONElement nextEl = arrIt.next(); + stored.reset(StoredGeometry::parseFrom(nextEl)); + + if (stored.get()) { + // Valid geometry element + geometries->push_back(stored.release()); + } else { + warning() << "geoNear stage read non-geometry element " << nextEl.toString() + << " in array " << el.toString(); } } - else { - warning() << "geoNear stage read non-geometry element " << el.toString(); - } + } else { + warning() << "geoNear stage read non-geometry element " << el.toString(); } } +} - static StatusWith<double> computeGeoNearDistance(const GeoNearParams& nearParams, - WorkingSetMember* member) { - - // - // Generic GeoNear distance computation - // Distances are computed by projecting the stored geometry into the query CRS, and - // computing distance in that CRS. - // - - // Must have an object in order to get geometry out of it. 
- invariant(member->hasObj()); - - CRS queryCRS = nearParams.nearQuery->centroid->crs; - - // Extract all the geometries out of this document for the near query - OwnedPointerVector<StoredGeometry> geometriesOwned; - vector<StoredGeometry*>& geometries = geometriesOwned.mutableVector(); - extractGeometries(member->obj.value(), nearParams.nearQuery->field, &geometries); - - // Compute the minimum distance of all the geometries in the document - double minDistance = -1; - BSONObj minDistanceObj; - for (vector<StoredGeometry*>::iterator it = geometries.begin(); it != geometries.end(); - ++it) { +static StatusWith<double> computeGeoNearDistance(const GeoNearParams& nearParams, + WorkingSetMember* member) { + // + // Generic GeoNear distance computation + // Distances are computed by projecting the stored geometry into the query CRS, and + // computing distance in that CRS. + // - StoredGeometry& stored = **it; + // Must have an object in order to get geometry out of it. + invariant(member->hasObj()); - // NOTE: A stored document with STRICT_SPHERE CRS is treated as a malformed document - // and ignored. Since GeoNear requires an index, there's no stored STRICT_SPHERE shape. - // So we don't check it here. + CRS queryCRS = nearParams.nearQuery->centroid->crs; - // NOTE: For now, we're sure that if we get this far in the query we'll have an - // appropriate index which validates the type of geometry we're pulling back here. - // TODO: It may make sense to change our semantics and, by default, only return - // shapes in the same CRS from $geoNear. 
- if (!stored.geometry.supportsProject(queryCRS)) - continue; - stored.geometry.projectInto(queryCRS); + // Extract all the geometries out of this document for the near query + OwnedPointerVector<StoredGeometry> geometriesOwned; + vector<StoredGeometry*>& geometries = geometriesOwned.mutableVector(); + extractGeometries(member->obj.value(), nearParams.nearQuery->field, &geometries); - double nextDistance = stored.geometry.minDistance(*nearParams.nearQuery->centroid); + // Compute the minimum distance of all the geometries in the document + double minDistance = -1; + BSONObj minDistanceObj; + for (vector<StoredGeometry*>::iterator it = geometries.begin(); it != geometries.end(); ++it) { + StoredGeometry& stored = **it; - if (minDistance < 0 || nextDistance < minDistance) { - minDistance = nextDistance; - minDistanceObj = stored.element.Obj(); - } - } + // NOTE: A stored document with STRICT_SPHERE CRS is treated as a malformed document + // and ignored. Since GeoNear requires an index, there's no stored STRICT_SPHERE shape. + // So we don't check it here. - if (minDistance < 0) { - // No distance to report - return StatusWith<double>(-1); - } + // NOTE: For now, we're sure that if we get this far in the query we'll have an + // appropriate index which validates the type of geometry we're pulling back here. + // TODO: It may make sense to change our semantics and, by default, only return + // shapes in the same CRS from $geoNear. + if (!stored.geometry.supportsProject(queryCRS)) + continue; + stored.geometry.projectInto(queryCRS); - if (nearParams.addDistMeta) { - if (nearParams.nearQuery->unitsAreRadians) { - // Hack for nearSphere - // TODO: Remove nearSphere? 
- invariant(SPHERE == queryCRS); - member->addComputed(new GeoDistanceComputedData(minDistance - / kRadiusOfEarthInMeters)); - } - else { - member->addComputed(new GeoDistanceComputedData(minDistance)); - } - } + double nextDistance = stored.geometry.minDistance(*nearParams.nearQuery->centroid); - if (nearParams.addPointMeta) { - member->addComputed(new GeoNearPointComputedData(minDistanceObj)); + if (minDistance < 0 || nextDistance < minDistance) { + minDistance = nextDistance; + minDistanceObj = stored.element.Obj(); } - - return StatusWith<double>(minDistance); } - static R2Annulus geoNearDistanceBounds(const GeoNearExpression& query) { - - const CRS queryCRS = query.centroid->crs; - - if (FLAT == queryCRS) { - return R2Annulus(query.centroid->oldPoint, query.minDistance, query.maxDistance); - } - - invariant(SPHERE == queryCRS); - - // TODO: Tighten this up a bit by making a CRS for "sphere with radians" - double minDistance = query.minDistance; - double maxDistance = query.maxDistance; + if (minDistance < 0) { + // No distance to report + return StatusWith<double>(-1); + } - if (query.unitsAreRadians) { - // Our input bounds are in radians, convert to meters since the query CRS is actually - // SPHERE. We'll convert back to radians on outputting distances. - minDistance *= kRadiusOfEarthInMeters; - maxDistance *= kRadiusOfEarthInMeters; + if (nearParams.addDistMeta) { + if (nearParams.nearQuery->unitsAreRadians) { + // Hack for nearSphere + // TODO: Remove nearSphere? + invariant(SPHERE == queryCRS); + member->addComputed(new GeoDistanceComputedData(minDistance / kRadiusOfEarthInMeters)); + } else { + member->addComputed(new GeoDistanceComputedData(minDistance)); } - - // GOTCHA: oldPoint is a misnomer - it is the original point data and is in the correct - // CRS. We must not try to derive the original point from the spherical S2Point generated - // as an optimization - the mapping is not 1->1 - [-180, 0] and [180, 0] map to the same - // place. 
- // TODO: Wrapping behavior should not depend on the index, which would make $near code - // insensitive to which direction we explore the index in. - return R2Annulus(query.centroid->oldPoint, - min(minDistance, kMaxEarthDistanceInMeters), - min(maxDistance, kMaxEarthDistanceInMeters)); } - // - // GeoNear2DStage - // - - static R2Annulus twoDDistanceBounds(const GeoNearParams& nearParams, - const IndexDescriptor* twoDIndex) { + if (nearParams.addPointMeta) { + member->addComputed(new GeoNearPointComputedData(minDistanceObj)); + } - R2Annulus fullBounds = geoNearDistanceBounds(*nearParams.nearQuery); - const CRS queryCRS = nearParams.nearQuery->centroid->crs; + return StatusWith<double>(minDistance); +} - if (FLAT == queryCRS) { +static R2Annulus geoNearDistanceBounds(const GeoNearExpression& query) { + const CRS queryCRS = query.centroid->crs; - // Reset the full bounds based on our index bounds - GeoHashConverter::Parameters hashParams; - Status status = GeoHashConverter::parseParameters(twoDIndex->infoObj(), &hashParams); - invariant(status.isOK()); // The index status should always be valid + if (FLAT == queryCRS) { + return R2Annulus(query.centroid->oldPoint, query.minDistance, query.maxDistance); + } - // The biggest distance possible in this indexed collection is the diagonal of the - // square indexed region. 
- const double sqrt2Approx = 1.5; - const double diagonalDist = sqrt2Approx * (hashParams.max - hashParams.min); + invariant(SPHERE == queryCRS); - fullBounds = R2Annulus(fullBounds.center(), - fullBounds.getInner(), - min(fullBounds.getOuter(), diagonalDist)); - } - else { - // Spherical queries have upper bounds set by the earth - no-op - // TODO: Wrapping errors would creep in here if nearSphere wasn't defined to not wrap - invariant(SPHERE == queryCRS); - invariant(!nearParams.nearQuery->isWrappingQuery); - } + // TODO: Tighten this up a bit by making a CRS for "sphere with radians" + double minDistance = query.minDistance; + double maxDistance = query.maxDistance; - return fullBounds; + if (query.unitsAreRadians) { + // Our input bounds are in radians, convert to meters since the query CRS is actually + // SPHERE. We'll convert back to radians on outputting distances. + minDistance *= kRadiusOfEarthInMeters; + maxDistance *= kRadiusOfEarthInMeters; } - class GeoNear2DStage::DensityEstimator { - public: - DensityEstimator(const IndexDescriptor* twoDindex, const GeoNearParams* nearParams) : - _twoDIndex(twoDindex), _nearParams(nearParams), _currentLevel(0) - { - GeoHashConverter::Parameters hashParams; - Status status = GeoHashConverter::parseParameters(_twoDIndex->infoObj(), - &hashParams); - // The index status should always be valid. - invariant(status.isOK()); - - _converter.reset(new GeoHashConverter(hashParams)); - _centroidCell = _converter->hash(_nearParams->nearQuery->centroid->oldPoint); - - // Since appendVertexNeighbors(level, output) requires level < hash.getBits(), - // we have to start to find documents at most GeoHash::kMaxBits - 1. Thus the finest - // search area is 16 * finest cell area at GeoHash::kMaxBits. - _currentLevel = std::max(0u, hashParams.bits - 1u); - } + // GOTCHA: oldPoint is a misnomer - it is the original point data and is in the correct + // CRS. 
We must not try to derive the original point from the spherical S2Point generated + // as an optimization - the mapping is not 1->1 - [-180, 0] and [180, 0] map to the same + // place. + // TODO: Wrapping behavior should not depend on the index, which would make $near code + // insensitive to which direction we explore the index in. + return R2Annulus(query.centroid->oldPoint, + min(minDistance, kMaxEarthDistanceInMeters), + min(maxDistance, kMaxEarthDistanceInMeters)); +} + +// +// GeoNear2DStage +// + +static R2Annulus twoDDistanceBounds(const GeoNearParams& nearParams, + const IndexDescriptor* twoDIndex) { + R2Annulus fullBounds = geoNearDistanceBounds(*nearParams.nearQuery); + const CRS queryCRS = nearParams.nearQuery->centroid->crs; + + if (FLAT == queryCRS) { + // Reset the full bounds based on our index bounds + GeoHashConverter::Parameters hashParams; + Status status = GeoHashConverter::parseParameters(twoDIndex->infoObj(), &hashParams); + invariant(status.isOK()); // The index status should always be valid + + // The biggest distance possible in this indexed collection is the diagonal of the + // square indexed region. 
+ const double sqrt2Approx = 1.5; + const double diagonalDist = sqrt2Approx * (hashParams.max - hashParams.min); + + fullBounds = R2Annulus( + fullBounds.center(), fullBounds.getInner(), min(fullBounds.getOuter(), diagonalDist)); + } else { + // Spherical queries have upper bounds set by the earth - no-op + // TODO: Wrapping errors would creep in here if nearSphere wasn't defined to not wrap + invariant(SPHERE == queryCRS); + invariant(!nearParams.nearQuery->isWrappingQuery); + } - PlanStage::StageState work(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - WorkingSetID* out, - double* estimatedDistance); - - void saveState(); - void restoreState(OperationContext* txn); - void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - - private: - void buildIndexScan(OperationContext* txn, WorkingSet* workingSet, Collection* collection); - - const IndexDescriptor* _twoDIndex; // Not owned here. - const GeoNearParams* _nearParams; // Not owned here. - unique_ptr<IndexScan> _indexScan; - unique_ptr<GeoHashConverter> _converter; - GeoHash _centroidCell; - unsigned _currentLevel; - }; - - // Initialize the internal states - void GeoNear2DStage::DensityEstimator::buildIndexScan(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection) - { - IndexScanParams scanParams; - scanParams.descriptor = _twoDIndex; - scanParams.direction = 1; - scanParams.doNotDedup = true; - - // Scan bounds on 2D indexes are only over the 2D field - other bounds aren't applicable. - // This is handled in query planning. 
- scanParams.bounds = _nearParams->baseBounds; - - // The "2d" field is always the first in the index - const string twoDFieldName = _nearParams->nearQuery->field; - const int twoDFieldPosition = 0; - - // Construct index intervals used by this stage - OrderedIntervalList oil; - oil.name = scanParams.bounds.fields[twoDFieldPosition].name; - - vector<GeoHash> neighbors; - // Return the neighbors of closest vertex to this cell at the given level. - _centroidCell.appendVertexNeighbors(_currentLevel, &neighbors); - std::sort(neighbors.begin(), neighbors.end()); - - for (vector<GeoHash>::const_iterator it = neighbors.begin(); it != neighbors.end(); it++) { - mongo::BSONObjBuilder builder; - it->appendHashMin(&builder, ""); - it->appendHashMax(&builder, ""); - oil.intervals.push_back(IndexBoundsBuilder::makeRangeInterval(builder.obj(), - true, - true)); - } + return fullBounds; +} - invariant(oil.isValidFor(1)); +class GeoNear2DStage::DensityEstimator { +public: + DensityEstimator(const IndexDescriptor* twoDindex, const GeoNearParams* nearParams) + : _twoDIndex(twoDindex), _nearParams(nearParams), _currentLevel(0) { + GeoHashConverter::Parameters hashParams; + Status status = GeoHashConverter::parseParameters(_twoDIndex->infoObj(), &hashParams); + // The index status should always be valid. + invariant(status.isOK()); - // Intersect the $near bounds we just generated into the bounds we have for anything else - // in the scan (i.e. $within) - IndexBoundsBuilder::intersectize(oil, - &scanParams.bounds.fields[twoDFieldPosition]); + _converter.reset(new GeoHashConverter(hashParams)); + _centroidCell = _converter->hash(_nearParams->nearQuery->centroid->oldPoint); - _indexScan.reset(new IndexScan(txn, scanParams, workingSet, NULL)); + // Since appendVertexNeighbors(level, output) requires level < hash.getBits(), + // we have to start to find documents at most GeoHash::kMaxBits - 1. Thus the finest + // search area is 16 * finest cell area at GeoHash::kMaxBits. 
+ _currentLevel = std::max(0u, hashParams.bits - 1u); } - // Return IS_EOF is we find a document in it's ancestor cells and set estimated distance - // from the nearest document. - PlanStage::StageState GeoNear2DStage::DensityEstimator::work(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - WorkingSetID* out, - double* estimatedDistance) - { - if (!_indexScan) { - // Setup index scan stage for current level. - buildIndexScan(txn, workingSet, collection); - } + PlanStage::StageState work(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + WorkingSetID* out, + double* estimatedDistance); + + void saveState(); + void restoreState(OperationContext* txn); + void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + +private: + void buildIndexScan(OperationContext* txn, WorkingSet* workingSet, Collection* collection); + + const IndexDescriptor* _twoDIndex; // Not owned here. + const GeoNearParams* _nearParams; // Not owned here. + unique_ptr<IndexScan> _indexScan; + unique_ptr<GeoHashConverter> _converter; + GeoHash _centroidCell; + unsigned _currentLevel; +}; + +// Initialize the internal states +void GeoNear2DStage::DensityEstimator::buildIndexScan(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection) { + IndexScanParams scanParams; + scanParams.descriptor = _twoDIndex; + scanParams.direction = 1; + scanParams.doNotDedup = true; + + // Scan bounds on 2D indexes are only over the 2D field - other bounds aren't applicable. + // This is handled in query planning. 
+ scanParams.bounds = _nearParams->baseBounds; + + // The "2d" field is always the first in the index + const string twoDFieldName = _nearParams->nearQuery->field; + const int twoDFieldPosition = 0; + + // Construct index intervals used by this stage + OrderedIntervalList oil; + oil.name = scanParams.bounds.fields[twoDFieldPosition].name; + + vector<GeoHash> neighbors; + // Return the neighbors of closest vertex to this cell at the given level. + _centroidCell.appendVertexNeighbors(_currentLevel, &neighbors); + std::sort(neighbors.begin(), neighbors.end()); + + for (vector<GeoHash>::const_iterator it = neighbors.begin(); it != neighbors.end(); it++) { + mongo::BSONObjBuilder builder; + it->appendHashMin(&builder, ""); + it->appendHashMax(&builder, ""); + oil.intervals.push_back(IndexBoundsBuilder::makeRangeInterval(builder.obj(), true, true)); + } - WorkingSetID workingSetID; - PlanStage::StageState state = _indexScan->work(&workingSetID); - - if (state == PlanStage::IS_EOF) { - // We ran through the neighbors but found nothing. - if (_currentLevel > 0u) { - // Advance to the next level and search again. - _currentLevel--; - // Reset index scan for the next level. - _indexScan.reset(NULL); - return PlanStage::NEED_TIME; - } + invariant(oil.isValidFor(1)); + + // Intersect the $near bounds we just generated into the bounds we have for anything else + // in the scan (i.e. $within) + IndexBoundsBuilder::intersectize(oil, &scanParams.bounds.fields[twoDFieldPosition]); + + _indexScan.reset(new IndexScan(txn, scanParams, workingSet, NULL)); +} + +// Return IS_EOF is we find a document in it's ancestor cells and set estimated distance +// from the nearest document. +PlanStage::StageState GeoNear2DStage::DensityEstimator::work(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + WorkingSetID* out, + double* estimatedDistance) { + if (!_indexScan) { + // Setup index scan stage for current level. 
+ buildIndexScan(txn, workingSet, collection); + } - // We are already at the top level. - *estimatedDistance = _converter->sizeEdge(_currentLevel); - return PlanStage::IS_EOF; - } else if (state == PlanStage::ADVANCED) { - // Found a document at current level. - *estimatedDistance = _converter->sizeEdge(_currentLevel); - // Clean up working set. - workingSet->free(workingSetID); - return PlanStage::IS_EOF; - } else if (state == PlanStage::NEED_YIELD) { - *out = workingSetID; + WorkingSetID workingSetID; + PlanStage::StageState state = _indexScan->work(&workingSetID); + + if (state == PlanStage::IS_EOF) { + // We ran through the neighbors but found nothing. + if (_currentLevel > 0u) { + // Advance to the next level and search again. + _currentLevel--; + // Reset index scan for the next level. + _indexScan.reset(NULL); + return PlanStage::NEED_TIME; } - // Propagate NEED_TIME or errors - return state; + // We are already at the top level. + *estimatedDistance = _converter->sizeEdge(_currentLevel); + return PlanStage::IS_EOF; + } else if (state == PlanStage::ADVANCED) { + // Found a document at current level. + *estimatedDistance = _converter->sizeEdge(_currentLevel); + // Clean up working set. 
+ workingSet->free(workingSetID); + return PlanStage::IS_EOF; + } else if (state == PlanStage::NEED_YIELD) { + *out = workingSetID; } - void GeoNear2DStage::DensityEstimator::saveState() { - if (_indexScan) { - _indexScan->saveState(); - } + // Propagate NEED_TIME or errors + return state; +} + +void GeoNear2DStage::DensityEstimator::saveState() { + if (_indexScan) { + _indexScan->saveState(); } +} - void GeoNear2DStage::DensityEstimator::restoreState(OperationContext* txn) { - if (_indexScan) { - _indexScan->restoreState(txn); - } +void GeoNear2DStage::DensityEstimator::restoreState(OperationContext* txn) { + if (_indexScan) { + _indexScan->restoreState(txn); } +} - void GeoNear2DStage::DensityEstimator::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - if (_indexScan) { - _indexScan->invalidate(txn, dl, type); - } +void GeoNear2DStage::DensityEstimator::invalidate(OperationContext* txn, + const RecordId& dl, + InvalidationType type) { + if (_indexScan) { + _indexScan->invalidate(txn, dl, type); } +} - PlanStage::StageState GeoNear2DStage::initialize(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - WorkingSetID* out) - { - if (!_densityEstimator) { - _densityEstimator.reset(new DensityEstimator(_twoDIndex, &_nearParams)); - } +PlanStage::StageState GeoNear2DStage::initialize(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + WorkingSetID* out) { + if (!_densityEstimator) { + _densityEstimator.reset(new DensityEstimator(_twoDIndex, &_nearParams)); + } - double estimatedDistance; - PlanStage::StageState state = _densityEstimator->work(txn, workingSet, collection, out, - &estimatedDistance); - - if (state == PlanStage::IS_EOF) { - // 2d index only works with legacy points as centroid. $nearSphere will project - // the point into SPHERE CRS and calculate distance based on that. - // STRICT_SPHERE is impossible here, as GeoJSON centroid is not allowed for 2d index. 
- - // Estimator finished its work, we need to finish initialization too. - if (SPHERE == _nearParams.nearQuery->centroid->crs) { - // Estimated distance is in degrees, convert it to meters. - _boundsIncrement = deg2rad(estimatedDistance) * kRadiusOfEarthInMeters * 3; - // Limit boundsIncrement to ~20KM, so that the first circle won't be too aggressive. - _boundsIncrement = std::min(_boundsIncrement, kMaxEarthDistanceInMeters / 1000.0); - } - else { - // We expand the radius by 3 times to give a reasonable starting search area. - // Assume points are distributed evenly. X is the edge size of cells at whose - // level we found a document in 4 neighbors. Thus the closest point is at least - // X/2 far from the centroid. The distance between two points is at least X. - // The area of Pi * (3X)^2 ~= 28 * X^2 will cover dozens of points at most. - // We'll explore the space with exponentially increasing radius if this guess is - // too small, so starting from a conservative initial radius doesn't hurt. - - _boundsIncrement = 3 * estimatedDistance; - } - invariant(_boundsIncrement > 0.0); + double estimatedDistance; + PlanStage::StageState state = + _densityEstimator->work(txn, workingSet, collection, out, &estimatedDistance); + + if (state == PlanStage::IS_EOF) { + // 2d index only works with legacy points as centroid. $nearSphere will project + // the point into SPHERE CRS and calculate distance based on that. + // STRICT_SPHERE is impossible here, as GeoJSON centroid is not allowed for 2d index. + + // Estimator finished its work, we need to finish initialization too. + if (SPHERE == _nearParams.nearQuery->centroid->crs) { + // Estimated distance is in degrees, convert it to meters. + _boundsIncrement = deg2rad(estimatedDistance) * kRadiusOfEarthInMeters * 3; + // Limit boundsIncrement to ~20KM, so that the first circle won't be too aggressive. 
+ _boundsIncrement = std::min(_boundsIncrement, kMaxEarthDistanceInMeters / 1000.0); + } else { + // We expand the radius by 3 times to give a reasonable starting search area. + // Assume points are distributed evenly. X is the edge size of cells at whose + // level we found a document in 4 neighbors. Thus the closest point is at least + // X/2 far from the centroid. The distance between two points is at least X. + // The area of Pi * (3X)^2 ~= 28 * X^2 will cover dozens of points at most. + // We'll explore the space with exponentially increasing radius if this guess is + // too small, so starting from a conservative initial radius doesn't hurt. - // Clean up - _densityEstimator.reset(NULL); + _boundsIncrement = 3 * estimatedDistance; } + invariant(_boundsIncrement > 0.0); - return state; + // Clean up + _densityEstimator.reset(NULL); } - static const string kTwoDIndexNearStage("GEO_NEAR_2D"); - - GeoNear2DStage::GeoNear2DStage(const GeoNearParams& nearParams, - OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - IndexDescriptor* twoDIndex) - : NearStage(txn, - workingSet, - collection, - new PlanStageStats(CommonStats(kTwoDIndexNearStage.c_str()), - STAGE_GEO_NEAR_2D)), - _nearParams(nearParams), - _twoDIndex(twoDIndex), - _fullBounds(twoDDistanceBounds(nearParams, twoDIndex)), - _currBounds(_fullBounds.center(), -1, _fullBounds.getInner()), - _boundsIncrement(0.0) { - - getNearStats()->keyPattern = twoDIndex->keyPattern(); - getNearStats()->indexName = twoDIndex->indexName(); - } + return state; +} - GeoNear2DStage::~GeoNear2DStage() { - } +static const string kTwoDIndexNearStage("GEO_NEAR_2D"); - void GeoNear2DStage::finishSaveState() { - if (_densityEstimator) { - _densityEstimator->saveState(); - } +GeoNear2DStage::GeoNear2DStage(const GeoNearParams& nearParams, + OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + IndexDescriptor* twoDIndex) + : NearStage(txn, + workingSet, + collection, + new 
PlanStageStats(CommonStats(kTwoDIndexNearStage.c_str()), STAGE_GEO_NEAR_2D)), + _nearParams(nearParams), + _twoDIndex(twoDIndex), + _fullBounds(twoDDistanceBounds(nearParams, twoDIndex)), + _currBounds(_fullBounds.center(), -1, _fullBounds.getInner()), + _boundsIncrement(0.0) { + getNearStats()->keyPattern = twoDIndex->keyPattern(); + getNearStats()->indexName = twoDIndex->indexName(); +} + +GeoNear2DStage::~GeoNear2DStage() {} + +void GeoNear2DStage::finishSaveState() { + if (_densityEstimator) { + _densityEstimator->saveState(); } +} - void GeoNear2DStage::finishRestoreState(OperationContext* txn) { - if (_densityEstimator) { - _densityEstimator->restoreState(txn); - } +void GeoNear2DStage::finishRestoreState(OperationContext* txn) { + if (_densityEstimator) { + _densityEstimator->restoreState(txn); } +} - void GeoNear2DStage::finishInvalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - if (_densityEstimator) { - _densityEstimator->invalidate(txn, dl, type); - } +void GeoNear2DStage::finishInvalidate(OperationContext* txn, + const RecordId& dl, + InvalidationType type) { + if (_densityEstimator) { + _densityEstimator->invalidate(txn, dl, type); } +} - namespace { - - /** - * Expression which checks whether a legacy 2D index point is contained within our near - * search annulus. See nextInterval() below for more discussion. 
- * TODO: Make this a standard type of GEO match expression - */ - class TwoDPtInAnnulusExpression : public LeafMatchExpression { - public: - - TwoDPtInAnnulusExpression(const R2Annulus& annulus, StringData twoDPath) - : LeafMatchExpression(INTERNAL_2D_POINT_IN_ANNULUS), _annulus(annulus) { - - initPath(twoDPath); - } - - virtual ~TwoDPtInAnnulusExpression() { - } - - virtual void toBSON(BSONObjBuilder* out) const { - out->append("TwoDPtInAnnulusExpression", true); - } - - virtual bool matchesSingleElement(const BSONElement& e) const { - if (!e.isABSONObj()) - return false; - - PointWithCRS point; - if (!GeoParser::parseStoredPoint(e, &point).isOK()) return false; - - return _annulus.contains(point.oldPoint); - } - - // - // These won't be called. - // - - virtual void debugString(StringBuilder& debug, int level = 0) const { - invariant(false); - } - - virtual bool equivalent(const MatchExpression* other) const { - invariant(false); - return false; - } - - virtual LeafMatchExpression* shallowClone() const { - invariant(false); - return NULL; - } - - private: +namespace { - R2Annulus _annulus; - }; - - /** - * Expression which checks whether a 2D key for a point (2D hash) intersects our search - * region. The search region may have been formed by more granular hashes. - */ - class TwoDKeyInRegionExpression : public LeafMatchExpression { - public: - - TwoDKeyInRegionExpression(R2Region* region, - const GeoHashConverter::Parameters& hashParams, - StringData twoDKeyPath) - : LeafMatchExpression(INTERNAL_2D_KEY_IN_REGION), - _region(region), - _unhasher(hashParams) { - - initPath(twoDKeyPath); - } - - virtual ~TwoDKeyInRegionExpression() { - } - - virtual void toBSON(BSONObjBuilder* out) const { - out->append("TwoDKeyInRegionExpression", true); - } - - virtual bool matchesSingleElement(const BSONElement& e) const { - // Something has gone terribly wrong if this doesn't hold. 
- invariant(BinData == e.type()); - return !_region->fastDisjoint(_unhasher.unhashToBoxCovering(_unhasher.hash(e))); - } - - // - // These won't be called. - // - - virtual void debugString(StringBuilder& debug, int level = 0) const { - invariant(false); - } - - virtual bool equivalent(const MatchExpression* other) const { - invariant(false); - return true; - } - - virtual MatchExpression* shallowClone() const { - invariant(false); - return NULL; - } +/** + * Expression which checks whether a legacy 2D index point is contained within our near + * search annulus. See nextInterval() below for more discussion. + * TODO: Make this a standard type of GEO match expression + */ +class TwoDPtInAnnulusExpression : public LeafMatchExpression { +public: + TwoDPtInAnnulusExpression(const R2Annulus& annulus, StringData twoDPath) + : LeafMatchExpression(INTERNAL_2D_POINT_IN_ANNULUS), _annulus(annulus) { + initPath(twoDPath); + } - private: + virtual ~TwoDPtInAnnulusExpression() {} - const unique_ptr<R2Region> _region; - const GeoHashConverter _unhasher; - }; + virtual void toBSON(BSONObjBuilder* out) const { + out->append("TwoDPtInAnnulusExpression", true); + } - // Helper class to maintain ownership of a match expression alongside an index scan - class IndexScanWithMatch : public IndexScan { - public: + virtual bool matchesSingleElement(const BSONElement& e) const { + if (!e.isABSONObj()) + return false; - IndexScanWithMatch(OperationContext* txn, - const IndexScanParams& params, - WorkingSet* workingSet, - MatchExpression* filter) - : IndexScan(txn, params, workingSet, filter), _matcher(filter) { - } + PointWithCRS point; + if (!GeoParser::parseStoredPoint(e, &point).isOK()) + return false; - virtual ~IndexScanWithMatch() { - } - - private: + return _annulus.contains(point.oldPoint); + } - // Owns matcher - const unique_ptr<MatchExpression> _matcher; - }; + // + // These won't be called. 
+ // - // Helper class to maintain ownership of a match expression alongside an index scan - class FetchStageWithMatch : public FetchStage { - public: + virtual void debugString(StringBuilder& debug, int level = 0) const { + invariant(false); + } - FetchStageWithMatch(OperationContext* txn, - WorkingSet* ws, - PlanStage* child, - MatchExpression* filter, - const Collection* collection) - : FetchStage(txn, ws, child, filter, collection), _matcher(filter) { - } + virtual bool equivalent(const MatchExpression* other) const { + invariant(false); + return false; + } - virtual ~FetchStageWithMatch() { - } + virtual LeafMatchExpression* shallowClone() const { + invariant(false); + return NULL; + } - private: +private: + R2Annulus _annulus; +}; - // Owns matcher - const unique_ptr<MatchExpression> _matcher; - }; +/** + * Expression which checks whether a 2D key for a point (2D hash) intersects our search + * region. The search region may have been formed by more granular hashes. + */ +class TwoDKeyInRegionExpression : public LeafMatchExpression { +public: + TwoDKeyInRegionExpression(R2Region* region, + const GeoHashConverter::Parameters& hashParams, + StringData twoDKeyPath) + : LeafMatchExpression(INTERNAL_2D_KEY_IN_REGION), _region(region), _unhasher(hashParams) { + initPath(twoDKeyPath); } - static double min2DBoundsIncrement(const GeoNearExpression& query, IndexDescriptor* twoDIndex) { - GeoHashConverter::Parameters hashParams; - Status status = GeoHashConverter::parseParameters(twoDIndex->infoObj(), &hashParams); - invariant(status.isOK()); // The index status should always be valid - GeoHashConverter hasher(hashParams); + virtual ~TwoDKeyInRegionExpression() {} - // The hasher error is the diagonal of a 2D hash region - it's generally not helpful - // to change region size such that a search radius is smaller than the 2D hash region - // max radius. This is slightly conservative for now (box diagonal vs circle radius). 
- double minBoundsIncrement = hasher.getError() / 2; + virtual void toBSON(BSONObjBuilder* out) const { + out->append("TwoDKeyInRegionExpression", true); + } - const CRS queryCRS = query.centroid->crs; - if (FLAT == queryCRS) - return minBoundsIncrement; + virtual bool matchesSingleElement(const BSONElement& e) const { + // Something has gone terribly wrong if this doesn't hold. + invariant(BinData == e.type()); + return !_region->fastDisjoint(_unhasher.unhashToBoxCovering(_unhasher.hash(e))); + } - invariant(SPHERE == queryCRS); + // + // These won't be called. + // - // If this is a spherical query, units are in meters - this is just a heuristic - return minBoundsIncrement * kMetersPerDegreeAtEquator; + virtual void debugString(StringBuilder& debug, int level = 0) const { + invariant(false); } - static R2Annulus projectBoundsToTwoDDegrees(R2Annulus sphereBounds) { - - const double outerDegrees = rad2deg(sphereBounds.getOuter() / kRadiusOfEarthInMeters); - const double innerDegrees = rad2deg(sphereBounds.getInner() / kRadiusOfEarthInMeters); - const double maxErrorDegrees = computeXScanDistance(sphereBounds.center().y, outerDegrees); + virtual bool equivalent(const MatchExpression* other) const { + invariant(false); + return true; + } - return R2Annulus(sphereBounds.center(), - max(0.0, innerDegrees - maxErrorDegrees), - outerDegrees + maxErrorDegrees); + virtual MatchExpression* shallowClone() const { + invariant(false); + return NULL; } - StatusWith<NearStage::CoveredInterval*> // +private: + const unique_ptr<R2Region> _region; + const GeoHashConverter _unhasher; +}; + +// Helper class to maintain ownership of a match expression alongside an index scan +class IndexScanWithMatch : public IndexScan { +public: + IndexScanWithMatch(OperationContext* txn, + const IndexScanParams& params, + WorkingSet* workingSet, + MatchExpression* filter) + : IndexScan(txn, params, workingSet, filter), _matcher(filter) {} + + virtual ~IndexScanWithMatch() {} + +private: + // Owns 
matcher + const unique_ptr<MatchExpression> _matcher; +}; + +// Helper class to maintain ownership of a match expression alongside an index scan +class FetchStageWithMatch : public FetchStage { +public: + FetchStageWithMatch(OperationContext* txn, + WorkingSet* ws, + PlanStage* child, + MatchExpression* filter, + const Collection* collection) + : FetchStage(txn, ws, child, filter, collection), _matcher(filter) {} + + virtual ~FetchStageWithMatch() {} + +private: + // Owns matcher + const unique_ptr<MatchExpression> _matcher; +}; +} + +static double min2DBoundsIncrement(const GeoNearExpression& query, IndexDescriptor* twoDIndex) { + GeoHashConverter::Parameters hashParams; + Status status = GeoHashConverter::parseParameters(twoDIndex->infoObj(), &hashParams); + invariant(status.isOK()); // The index status should always be valid + GeoHashConverter hasher(hashParams); + + // The hasher error is the diagonal of a 2D hash region - it's generally not helpful + // to change region size such that a search radius is smaller than the 2D hash region + // max radius. This is slightly conservative for now (box diagonal vs circle radius). 
+ double minBoundsIncrement = hasher.getError() / 2; + + const CRS queryCRS = query.centroid->crs; + if (FLAT == queryCRS) + return minBoundsIncrement; + + invariant(SPHERE == queryCRS); + + // If this is a spherical query, units are in meters - this is just a heuristic + return minBoundsIncrement * kMetersPerDegreeAtEquator; +} + +static R2Annulus projectBoundsToTwoDDegrees(R2Annulus sphereBounds) { + const double outerDegrees = rad2deg(sphereBounds.getOuter() / kRadiusOfEarthInMeters); + const double innerDegrees = rad2deg(sphereBounds.getInner() / kRadiusOfEarthInMeters); + const double maxErrorDegrees = computeXScanDistance(sphereBounds.center().y, outerDegrees); + + return R2Annulus(sphereBounds.center(), + max(0.0, innerDegrees - maxErrorDegrees), + outerDegrees + maxErrorDegrees); +} + +StatusWith<NearStage::CoveredInterval*> // GeoNear2DStage::nextInterval(OperationContext* txn, WorkingSet* workingSet, Collection* collection) { + // The search is finished if we searched at least once and all the way to the edge + if (_currBounds.getInner() >= 0 && _currBounds.getOuter() == _fullBounds.getOuter()) { + return StatusWith<CoveredInterval*>(NULL); + } - // The search is finished if we searched at least once and all the way to the edge - if (_currBounds.getInner() >= 0 && _currBounds.getOuter() == _fullBounds.getOuter()) { - return StatusWith<CoveredInterval*>(NULL); - } - - // - // Setup the next interval - // - - const NearStats* stats = getNearStats(); - - if (!stats->intervalStats.empty()) { - - const IntervalStats& lastIntervalStats = stats->intervalStats.back(); + // + // Setup the next interval + // - // TODO: Generally we want small numbers of results fast, then larger numbers later - if (lastIntervalStats.numResultsBuffered < 300) - _boundsIncrement *= 2; - else if (lastIntervalStats.numResultsBuffered > 600) - _boundsIncrement /= 2; - } + const NearStats* stats = getNearStats(); - _boundsIncrement = max(_boundsIncrement, - 
min2DBoundsIncrement(*_nearParams.nearQuery, _twoDIndex)); + if (!stats->intervalStats.empty()) { + const IntervalStats& lastIntervalStats = stats->intervalStats.back(); - R2Annulus nextBounds(_currBounds.center(), - _currBounds.getOuter(), - min(_currBounds.getOuter() + _boundsIncrement, - _fullBounds.getOuter())); + // TODO: Generally we want small numbers of results fast, then larger numbers later + if (lastIntervalStats.numResultsBuffered < 300) + _boundsIncrement *= 2; + else if (lastIntervalStats.numResultsBuffered > 600) + _boundsIncrement /= 2; + } - const bool isLastInterval = (nextBounds.getOuter() == _fullBounds.getOuter()); - _currBounds = nextBounds; + _boundsIncrement = + max(_boundsIncrement, min2DBoundsIncrement(*_nearParams.nearQuery, _twoDIndex)); - // - // Get a covering region for this interval - // + R2Annulus nextBounds(_currBounds.center(), + _currBounds.getOuter(), + min(_currBounds.getOuter() + _boundsIncrement, _fullBounds.getOuter())); - const CRS queryCRS = _nearParams.nearQuery->centroid->crs; - - unique_ptr<R2Region> coverRegion; - - if (FLAT == queryCRS) { - - // NOTE: Due to floating point math issues, FLAT searches of a 2D index need to treat - // containment and distance separately. - // Ex: (distance) 54.001 - 54 > 0.001, but (containment) 54 + 0.001 <= 54.001 - // The idea is that a $near search with bounds is really a $within search, sorted by - // distance. We attach a custom $within : annulus matcher to do the $within search, - // and adjust max/min bounds slightly since, as above, containment does not mean the - // distance calculation won't slightly overflow the boundary. - // - // The code below adjusts: - // 1) Overall min/max bounds of the generated distance intervals to be more inclusive - // 2) Bounds of the interval covering to be more inclusive - // ... and later on we add the custom $within : annulus matcher. 
- // - // IMPORTANT: The *internal* interval distance bounds are *exact thresholds* - these - // should not be adjusted. - // TODO: Maybe integrate annuluses as a standard shape, and literally transform $near - // internally into a $within query with $near just as sort. - - // Compute the maximum axis-aligned distance error - const double epsilon = std::numeric_limits<double>::epsilon() - * (max(abs(_fullBounds.center().x), abs(_fullBounds.center().y)) - + _fullBounds.getOuter()); - - if (nextBounds.getInner() > 0 && nextBounds.getInner() == _fullBounds.getInner()) { - nextBounds = R2Annulus(nextBounds.center(), - max(0.0, nextBounds.getInner() - epsilon), - nextBounds.getOuter()); - } + const bool isLastInterval = (nextBounds.getOuter() == _fullBounds.getOuter()); + _currBounds = nextBounds; - if (nextBounds.getOuter() > 0 && nextBounds.getOuter() == _fullBounds.getOuter()) { - // We're at the max bound of the search, adjust interval maximum - nextBounds = R2Annulus(nextBounds.center(), - nextBounds.getInner(), - nextBounds.getOuter() + epsilon); - } + // + // Get a covering region for this interval + // - // *Always* adjust the covering bounds to be more inclusive - coverRegion.reset(new R2Annulus(nextBounds.center(), - max(0.0, nextBounds.getInner() - epsilon), - nextBounds.getOuter() + epsilon)); - } - else { - invariant(SPHERE == queryCRS); - // TODO: As above, make this consistent with $within : $centerSphere + const CRS queryCRS = _nearParams.nearQuery->centroid->crs; - // Our intervals aren't in the same CRS as our index, so we need to adjust them - coverRegion.reset(new R2Annulus(projectBoundsToTwoDDegrees(nextBounds))); - } + unique_ptr<R2Region> coverRegion; + if (FLAT == queryCRS) { + // NOTE: Due to floating point math issues, FLAT searches of a 2D index need to treat + // containment and distance separately. 
+ // Ex: (distance) 54.001 - 54 > 0.001, but (containment) 54 + 0.001 <= 54.001 + // The idea is that a $near search with bounds is really a $within search, sorted by + // distance. We attach a custom $within : annulus matcher to do the $within search, + // and adjust max/min bounds slightly since, as above, containment does not mean the + // distance calculation won't slightly overflow the boundary. // - // Setup the stages for this interval - // - - IndexScanParams scanParams; - scanParams.descriptor = _twoDIndex; - scanParams.direction = 1; - // We use a filter on the key. The filter rejects keys that don't intersect with the - // annulus. An object that is in the annulus might have a key that's not in it and a key - // that's in it. As such we can't just look at one key per object. + // The code below adjusts: + // 1) Overall min/max bounds of the generated distance intervals to be more inclusive + // 2) Bounds of the interval covering to be more inclusive + // ... and later on we add the custom $within : annulus matcher. // - // This does force us to do our own deduping of results, though. - scanParams.doNotDedup = true; - - // Scan bounds on 2D indexes are only over the 2D field - other bounds aren't applicable. - // This is handled in query planning. - scanParams.bounds = _nearParams.baseBounds; - - // The "2d" field is always the first in the index - const string twoDFieldName = _nearParams.nearQuery->field; - const int twoDFieldPosition = 0; - - OrderedIntervalList coveredIntervals; - coveredIntervals.name = scanParams.bounds.fields[twoDFieldPosition].name; - - ExpressionMapping::cover2d(*coverRegion, - _twoDIndex->infoObj(), - internalGeoNearQuery2DMaxCoveringCells, - &coveredIntervals); - - // Intersect the $near bounds we just generated into the bounds we have for anything else - // in the scan (i.e. 
$within) - IndexBoundsBuilder::intersectize(coveredIntervals, - &scanParams.bounds.fields[twoDFieldPosition]); - - // These parameters are stored by the index, and so must be ok - GeoHashConverter::Parameters hashParams; - GeoHashConverter::parseParameters(_twoDIndex->infoObj(), &hashParams); - - MatchExpression* keyMatcher = - new TwoDKeyInRegionExpression(coverRegion.release(), - hashParams, - twoDFieldName); - - // 2D indexes support covered search over additional fields they contain - // TODO: Don't need to clone, can just attach to custom matcher above - if (_nearParams.filter) { - AndMatchExpression* andMatcher = new AndMatchExpression(); - andMatcher->add(keyMatcher); - andMatcher->add(_nearParams.filter->shallowClone()); - keyMatcher = andMatcher; + // IMPORTANT: The *internal* interval distance bounds are *exact thresholds* - these + // should not be adjusted. + // TODO: Maybe integrate annuluses as a standard shape, and literally transform $near + // internally into a $within query with $near just as sort. 
+ + // Compute the maximum axis-aligned distance error + const double epsilon = std::numeric_limits<double>::epsilon() * + (max(abs(_fullBounds.center().x), abs(_fullBounds.center().y)) + + _fullBounds.getOuter()); + + if (nextBounds.getInner() > 0 && nextBounds.getInner() == _fullBounds.getInner()) { + nextBounds = R2Annulus(nextBounds.center(), + max(0.0, nextBounds.getInner() - epsilon), + nextBounds.getOuter()); } - // IndexScanWithMatch owns the matcher - IndexScan* scan = new IndexScanWithMatch(txn, scanParams, workingSet, keyMatcher); - - MatchExpression* docMatcher = NULL; - - // FLAT searches need to add an additional annulus $within matcher, see above - if (FLAT == queryCRS) { - docMatcher = new TwoDPtInAnnulusExpression(_fullBounds, twoDFieldName); + if (nextBounds.getOuter() > 0 && nextBounds.getOuter() == _fullBounds.getOuter()) { + // We're at the max bound of the search, adjust interval maximum + nextBounds = R2Annulus( + nextBounds.center(), nextBounds.getInner(), nextBounds.getOuter() + epsilon); } - // FetchStage owns index scan - FetchStage* fetcher(new FetchStageWithMatch(txn, - workingSet, - scan, - docMatcher, - collection)); - - return StatusWith<CoveredInterval*>(new CoveredInterval(fetcher, - true, - nextBounds.getInner(), - nextBounds.getOuter(), - isLastInterval)); - } + // *Always* adjust the covering bounds to be more inclusive + coverRegion.reset(new R2Annulus(nextBounds.center(), + max(0.0, nextBounds.getInner() - epsilon), + nextBounds.getOuter() + epsilon)); + } else { + invariant(SPHERE == queryCRS); + // TODO: As above, make this consistent with $within : $centerSphere - StatusWith<double> GeoNear2DStage::computeDistance(WorkingSetMember* member) { - return computeGeoNearDistance(_nearParams, member); + // Our intervals aren't in the same CRS as our index, so we need to adjust them + coverRegion.reset(new R2Annulus(projectBoundsToTwoDDegrees(nextBounds))); } // - // GeoNear2DSphereStage + // Setup the stages for this interval // - 
static int getFieldPosition(const IndexDescriptor* index, const string& fieldName) { - - int fieldPosition = 0; - - BSONObjIterator specIt(index->keyPattern()); - while (specIt.more()) { - if (specIt.next().fieldName() == fieldName) { - break; - } - ++fieldPosition; - } - - if (fieldPosition == index->keyPattern().nFields()) - return -1; - - return fieldPosition; + IndexScanParams scanParams; + scanParams.descriptor = _twoDIndex; + scanParams.direction = 1; + // We use a filter on the key. The filter rejects keys that don't intersect with the + // annulus. An object that is in the annulus might have a key that's not in it and a key + // that's in it. As such we can't just look at one key per object. + // + // This does force us to do our own deduping of results, though. + scanParams.doNotDedup = true; + + // Scan bounds on 2D indexes are only over the 2D field - other bounds aren't applicable. + // This is handled in query planning. + scanParams.bounds = _nearParams.baseBounds; + + // The "2d" field is always the first in the index + const string twoDFieldName = _nearParams.nearQuery->field; + const int twoDFieldPosition = 0; + + OrderedIntervalList coveredIntervals; + coveredIntervals.name = scanParams.bounds.fields[twoDFieldPosition].name; + + ExpressionMapping::cover2d(*coverRegion, + _twoDIndex->infoObj(), + internalGeoNearQuery2DMaxCoveringCells, + &coveredIntervals); + + // Intersect the $near bounds we just generated into the bounds we have for anything else + // in the scan (i.e. 
$within) + IndexBoundsBuilder::intersectize(coveredIntervals, + &scanParams.bounds.fields[twoDFieldPosition]); + + // These parameters are stored by the index, and so must be ok + GeoHashConverter::Parameters hashParams; + GeoHashConverter::parseParameters(_twoDIndex->infoObj(), &hashParams); + + MatchExpression* keyMatcher = + new TwoDKeyInRegionExpression(coverRegion.release(), hashParams, twoDFieldName); + + // 2D indexes support covered search over additional fields they contain + // TODO: Don't need to clone, can just attach to custom matcher above + if (_nearParams.filter) { + AndMatchExpression* andMatcher = new AndMatchExpression(); + andMatcher->add(keyMatcher); + andMatcher->add(_nearParams.filter->shallowClone()); + keyMatcher = andMatcher; } - static const string kS2IndexNearStage("GEO_NEAR_2DSPHERE"); - - GeoNear2DSphereStage::GeoNear2DSphereStage(const GeoNearParams& nearParams, - OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - IndexDescriptor* s2Index) - : NearStage(txn, - workingSet, - collection, - new PlanStageStats(CommonStats(kS2IndexNearStage.c_str()), - STAGE_GEO_NEAR_2DSPHERE)), - _nearParams(nearParams), - _s2Index(s2Index), - _fullBounds(geoNearDistanceBounds(*nearParams.nearQuery)), - _currBounds(_fullBounds.center(), -1, _fullBounds.getInner()), - _boundsIncrement(0.0) { - - getNearStats()->keyPattern = s2Index->keyPattern(); - getNearStats()->indexName = s2Index->indexName(); - } + // IndexScanWithMatch owns the matcher + IndexScan* scan = new IndexScanWithMatch(txn, scanParams, workingSet, keyMatcher); - GeoNear2DSphereStage::~GeoNear2DSphereStage() { + MatchExpression* docMatcher = NULL; + + // FLAT searches need to add an additional annulus $within matcher, see above + if (FLAT == queryCRS) { + docMatcher = new TwoDPtInAnnulusExpression(_fullBounds, twoDFieldName); } - namespace { - - S2Region* buildS2Region(const R2Annulus& sphereBounds) { - // Internal bounds come in SPHERE CRS units - // i.e. 
center is lon/lat, inner/outer are in meters - S2LatLng latLng = S2LatLng::FromDegrees(sphereBounds.center().y, - sphereBounds.center().x); - - vector<S2Region*> regions; - - S2Cap innerCap = S2Cap::FromAxisAngle(latLng.ToPoint(), - S1Angle::Radians(sphereBounds.getInner() - / kRadiusOfEarthInMeters)); - innerCap = innerCap.Complement(); - regions.push_back(new S2Cap(innerCap)); - - // We only need to max bound if this is not a full search of the Earth - // Using the constant here is important since we use the min of kMaxEarthDistance - // and the actual bounds passed in to set up the search area. - if (sphereBounds.getOuter() < kMaxEarthDistanceInMeters) { - S2Cap outerCap = S2Cap::FromAxisAngle(latLng.ToPoint(), - S1Angle::Radians(sphereBounds.getOuter() - / kRadiusOfEarthInMeters)); - regions.push_back(new S2Cap(outerCap)); - }

 - // Takes ownership of caps - return new S2RegionIntersection(&regions); - }

 - /** - * Expression which checks whether a 2DSphere key for a point (S2 hash) intersects our - * search region. The search region may have been formed by more granular hashes. 
- */ - class TwoDSphereKeyInRegionExpression : public LeafMatchExpression { - public: +StatusWith<double> GeoNear2DStage::computeDistance(WorkingSetMember* member) { + return computeGeoNearDistance(_nearParams, member); +} - TwoDSphereKeyInRegionExpression(const R2Annulus& bounds, StringData twoDSpherePath) - : LeafMatchExpression(INTERNAL_2DSPHERE_KEY_IN_REGION), - _region(buildS2Region(bounds)) { +// +// GeoNear2DSphereStage +// - initPath(twoDSpherePath); - } +static int getFieldPosition(const IndexDescriptor* index, const string& fieldName) { + int fieldPosition = 0; - virtual ~TwoDSphereKeyInRegionExpression() { - } + BSONObjIterator specIt(index->keyPattern()); + while (specIt.more()) { + if (specIt.next().fieldName() == fieldName) { + break; + } + ++fieldPosition; + } - virtual void toBSON(BSONObjBuilder* out) const { - out->append("TwoDSphereKeyInRegionExpression", true); - } + if (fieldPosition == index->keyPattern().nFields()) + return -1; + + return fieldPosition; +} + +static const string kS2IndexNearStage("GEO_NEAR_2DSPHERE"); + +GeoNear2DSphereStage::GeoNear2DSphereStage(const GeoNearParams& nearParams, + OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + IndexDescriptor* s2Index) + : NearStage( + txn, + workingSet, + collection, + new PlanStageStats(CommonStats(kS2IndexNearStage.c_str()), STAGE_GEO_NEAR_2DSPHERE)), + _nearParams(nearParams), + _s2Index(s2Index), + _fullBounds(geoNearDistanceBounds(*nearParams.nearQuery)), + _currBounds(_fullBounds.center(), -1, _fullBounds.getInner()), + _boundsIncrement(0.0) { + getNearStats()->keyPattern = s2Index->keyPattern(); + getNearStats()->indexName = s2Index->indexName(); +} + +GeoNear2DSphereStage::~GeoNear2DSphereStage() {} + +namespace { + +S2Region* buildS2Region(const R2Annulus& sphereBounds) { + // Internal bounds come in SPHERE CRS units + // i.e. 
center is lon/lat, inner/outer are in meters + S2LatLng latLng = S2LatLng::FromDegrees(sphereBounds.center().y, sphereBounds.center().x); + + vector<S2Region*> regions; + + S2Cap innerCap = S2Cap::FromAxisAngle( + latLng.ToPoint(), S1Angle::Radians(sphereBounds.getInner() / kRadiusOfEarthInMeters)); + innerCap = innerCap.Complement(); + regions.push_back(new S2Cap(innerCap)); + + // We only need to max bound if this is not a full search of the Earth + // Using the constant here is important since we use the min of kMaxEarthDistance + // and the actual bounds passed in to set up the search area. + if (sphereBounds.getOuter() < kMaxEarthDistanceInMeters) { + S2Cap outerCap = S2Cap::FromAxisAngle( + latLng.ToPoint(), S1Angle::Radians(sphereBounds.getOuter() / kRadiusOfEarthInMeters)); + regions.push_back(new S2Cap(outerCap)); + }

 - virtual bool matchesSingleElement(const BSONElement& e) const { - // Something has gone terribly wrong if this doesn't hold. - invariant(String == e.type()); - S2Cell keyCell = S2Cell(S2CellId::FromString(e.str())); - return _region->MayIntersect(keyCell); - }

 + // Takes ownership of caps + return new S2RegionIntersection(&regions); +}

 - const S2Region& getRegion() { - return *_region; - }

 + /** + * Expression which checks whether a 2DSphere key for a point (S2 hash) intersects our + * search region. The search region may have been formed by more granular hashes. + */ +class TwoDSphereKeyInRegionExpression : public LeafMatchExpression { +public: + TwoDSphereKeyInRegionExpression(const R2Annulus& bounds, StringData twoDSpherePath) + : LeafMatchExpression(INTERNAL_2DSPHERE_KEY_IN_REGION), _region(buildS2Region(bounds)) { + initPath(twoDSpherePath); + }

 - // - // These won't be called. 
- // + virtual ~TwoDSphereKeyInRegionExpression() {}

 - virtual void debugString(StringBuilder& debug, int level = 0) const { - invariant(false); - } + virtual void toBSON(BSONObjBuilder* out) const { + out->append("TwoDSphereKeyInRegionExpression", true); + }

 - virtual bool equivalent(const MatchExpression* other) const { - invariant(false); - return true; - } + virtual bool matchesSingleElement(const BSONElement& e) const { + // Something has gone terribly wrong if this doesn't hold. + invariant(String == e.type()); + S2Cell keyCell = S2Cell(S2CellId::FromString(e.str())); + return _region->MayIntersect(keyCell); + }

 - virtual MatchExpression* shallowClone() const { - invariant(false); - return NULL; - } + const S2Region& getRegion() { + return *_region; + }

 - private: + // + // These won't be called. + // - const unique_ptr<S2Region> _region; - }; + virtual void debugString(StringBuilder& debug, int level = 0) const { + invariant(false); }

 - // Estimate the density of data by search the nearest cells level by level around center. - class GeoNear2DSphereStage::DensityEstimator { - public: - DensityEstimator(const IndexDescriptor* s2Index, const GeoNearParams* nearParams) : - _s2Index(s2Index), _nearParams(nearParams), _currentLevel(0) - { - S2IndexingParams params; - ExpressionParams::parse2dsphereParams(_s2Index->infoObj(), &params); - // Since cellId.AppendVertexNeighbors(level, output) requires level < cellId.level(), - // we have to start to find documents at most S2::kMaxCellLevel - 1. Thus the finest - // search area is 16 * finest cell area at S2::kMaxCellLevel, which is less than - // (1.4 inch X 1.4 inch) on the earth. - _currentLevel = std::max(0, params.finestIndexedLevel - 1); - } + virtual bool equivalent(const MatchExpression* other) const { + invariant(false); + return true; + }

 - // Search for a document in neighbors at current level. - // Return IS_EOF is such document exists and set the estimated distance to the nearest doc. 
- PlanStage::StageState work(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - WorkingSetID* out, - double* estimatedDistance); - - void saveState(); - void restoreState(OperationContext* txn); - void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - - private: - void buildIndexScan(OperationContext* txn, WorkingSet* workingSet, Collection* collection); - - const IndexDescriptor* _s2Index; // Not owned here. - const GeoNearParams* _nearParams; // Not owned here. - int _currentLevel; - unique_ptr<IndexScan> _indexScan; - }; - - // Setup the index scan stage for neighbors at this level. - void GeoNear2DSphereStage::DensityEstimator::buildIndexScan(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection) - { - IndexScanParams scanParams; - scanParams.descriptor = _s2Index; - scanParams.direction = 1; - scanParams.doNotDedup = true; - scanParams.bounds = _nearParams->baseBounds; - - // Because the planner doesn't yet set up 2D index bounds, do it ourselves here - const string s2Field = _nearParams->nearQuery->field; - const int s2FieldPosition = getFieldPosition(_s2Index, s2Field); - fassert(28677, s2FieldPosition >= 0); - OrderedIntervalList* coveredIntervals = &scanParams.bounds.fields[s2FieldPosition]; - coveredIntervals->intervals.clear(); - - // Find 4 neighbors (3 neighbors at face vertex) at current level. - const S2CellId& centerId = _nearParams->nearQuery->centroid->cell.id(); - vector<S2CellId> neighbors; - - // The search area expands 4X each time. - // Return the neighbors of closest vertex to this cell at the given level. 
- invariant(_currentLevel < centerId.level()); - centerId.AppendVertexNeighbors(_currentLevel, &neighbors); - - // Convert S2CellId to string and sort - vector<string> neighborKeys; - for (vector<S2CellId>::const_iterator it = neighbors.begin(); it != neighbors.end(); it++) { - neighborKeys.push_back(it->toString()); - } - std::sort(neighborKeys.begin(), neighborKeys.end()); - - for (vector<string>::const_iterator it = neighborKeys.begin(); it != neighborKeys.end(); - it++) - { - // construct interval [*it, end) for this cell. - std::string end = *it; - end[end.size() - 1]++; - coveredIntervals->intervals.push_back( - IndexBoundsBuilder::makeRangeInterval(*it, end, true, false)); - }

 + virtual MatchExpression* shallowClone() const { + invariant(false); + return NULL; + }

 - invariant(coveredIntervals->isValidFor(1));

 +private: + const unique_ptr<S2Region> _region; +}; +}
+
+// Estimate the density of data by search the nearest cells level by level around center. +class GeoNear2DSphereStage::DensityEstimator { +public: + DensityEstimator(const IndexDescriptor* s2Index, const GeoNearParams* nearParams) + : _s2Index(s2Index), _nearParams(nearParams), _currentLevel(0) { + S2IndexingParams params; + ExpressionParams::parse2dsphereParams(_s2Index->infoObj(), &params); + // Since cellId.AppendVertexNeighbors(level, output) requires level < cellId.level(), + // we have to start to find documents at most S2::kMaxCellLevel - 1. Thus the finest + // search area is 16 * finest cell area at S2::kMaxCellLevel, which is less than + // (1.4 inch X 1.4 inch) on the earth. + _currentLevel = std::max(0, params.finestIndexedLevel - 1); + }

 - // Index scan - _indexScan.reset(new IndexScan(txn, scanParams, workingSet, NULL));

 + // Search for a document in neighbors at current level. + // Return IS_EOF is such document exists and set the estimated distance to the nearest doc. 
+ PlanStage::StageState work(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + WorkingSetID* out, + double* estimatedDistance); + + void saveState(); + void restoreState(OperationContext* txn); + void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + +private: + void buildIndexScan(OperationContext* txn, WorkingSet* workingSet, Collection* collection); + + const IndexDescriptor* _s2Index; // Not owned here. + const GeoNearParams* _nearParams; // Not owned here. + int _currentLevel; + unique_ptr<IndexScan> _indexScan; +}; + +// Setup the index scan stage for neighbors at this level. +void GeoNear2DSphereStage::DensityEstimator::buildIndexScan(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection) { + IndexScanParams scanParams; + scanParams.descriptor = _s2Index; + scanParams.direction = 1; + scanParams.doNotDedup = true; + scanParams.bounds = _nearParams->baseBounds; + + // Because the planner doesn't yet set up 2D index bounds, do it ourselves here + const string s2Field = _nearParams->nearQuery->field; + const int s2FieldPosition = getFieldPosition(_s2Index, s2Field); + fassert(28677, s2FieldPosition >= 0); + OrderedIntervalList* coveredIntervals = &scanParams.bounds.fields[s2FieldPosition]; + coveredIntervals->intervals.clear(); + + // Find 4 neighbors (3 neighbors at face vertex) at current level. + const S2CellId& centerId = _nearParams->nearQuery->centroid->cell.id(); + vector<S2CellId> neighbors; + + // The search area expands 4X each time. + // Return the neighbors of closest vertex to this cell at the given level. 
+ invariant(_currentLevel < centerId.level()); + centerId.AppendVertexNeighbors(_currentLevel, &neighbors); + + // Convert S2CellId to string and sort + vector<string> neighborKeys; + for (vector<S2CellId>::const_iterator it = neighbors.begin(); it != neighbors.end(); it++) { + neighborKeys.push_back(it->toString()); + } + std::sort(neighborKeys.begin(), neighborKeys.end()); + + for (vector<string>::const_iterator it = neighborKeys.begin(); it != neighborKeys.end(); it++) { + // construct interval [*it, end) for this cell. + std::string end = *it; + end[end.size() - 1]++; + coveredIntervals->intervals.push_back( + IndexBoundsBuilder::makeRangeInterval(*it, end, true, false)); } - PlanStage::StageState GeoNear2DSphereStage::DensityEstimator::work(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - WorkingSetID* out, - double* estimatedDistance) - { - if (!_indexScan) { - // Setup index scan stage for current level. - buildIndexScan(txn, workingSet, collection); - } + invariant(coveredIntervals->isValidFor(1)); - WorkingSetID workingSetID; - PlanStage::StageState state = _indexScan->work(&workingSetID); - - if (state == PlanStage::IS_EOF) { - // We ran through the neighbors but found nothing. - if (_currentLevel > 0) { - // Advance to the next level and search again. - _currentLevel--; - // Reset index scan for the next level. - _indexScan.reset(NULL); - return PlanStage::NEED_TIME; - } + // Index scan + _indexScan.reset(new IndexScan(txn, scanParams, workingSet, NULL)); +} + +PlanStage::StageState GeoNear2DSphereStage::DensityEstimator::work(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + WorkingSetID* out, + double* estimatedDistance) { + if (!_indexScan) { + // Setup index scan stage for current level. + buildIndexScan(txn, workingSet, collection); + } - // We are already at the top level. 
- *estimatedDistance = S2::kAvgEdge.GetValue(_currentLevel) * kRadiusOfEarthInMeters; - return PlanStage::IS_EOF; - } else if (state == PlanStage::ADVANCED) { - // We found something! - *estimatedDistance = S2::kAvgEdge.GetValue(_currentLevel) * kRadiusOfEarthInMeters; - // Clean up working set. - workingSet->free(workingSetID); - return PlanStage::IS_EOF; - } else if (state == PlanStage::NEED_YIELD) { - *out = workingSetID; + WorkingSetID workingSetID; + PlanStage::StageState state = _indexScan->work(&workingSetID); + + if (state == PlanStage::IS_EOF) { + // We ran through the neighbors but found nothing. + if (_currentLevel > 0) { + // Advance to the next level and search again. + _currentLevel--; + // Reset index scan for the next level. + _indexScan.reset(NULL); + return PlanStage::NEED_TIME; } - // Propagate NEED_TIME or errors - return state; + // We are already at the top level. + *estimatedDistance = S2::kAvgEdge.GetValue(_currentLevel) * kRadiusOfEarthInMeters; + return PlanStage::IS_EOF; + } else if (state == PlanStage::ADVANCED) { + // We found something! + *estimatedDistance = S2::kAvgEdge.GetValue(_currentLevel) * kRadiusOfEarthInMeters; + // Clean up working set. 
+ workingSet->free(workingSetID); + return PlanStage::IS_EOF; + } else if (state == PlanStage::NEED_YIELD) { + *out = workingSetID; } - void GeoNear2DSphereStage::DensityEstimator::saveState() { - if (_indexScan) { - _indexScan->saveState(); - } + // Propagate NEED_TIME or errors + return state; +} + +void GeoNear2DSphereStage::DensityEstimator::saveState() { + if (_indexScan) { + _indexScan->saveState(); } +} - void GeoNear2DSphereStage::DensityEstimator::restoreState(OperationContext* txn) { - if (_indexScan) { - _indexScan->restoreState(txn); - } +void GeoNear2DSphereStage::DensityEstimator::restoreState(OperationContext* txn) { + if (_indexScan) { + _indexScan->restoreState(txn); } +} - void GeoNear2DSphereStage::DensityEstimator::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - if (_indexScan) { - _indexScan->invalidate(txn, dl, type); - } +void GeoNear2DSphereStage::DensityEstimator::invalidate(OperationContext* txn, + const RecordId& dl, + InvalidationType type) { + if (_indexScan) { + _indexScan->invalidate(txn, dl, type); } +} - PlanStage::StageState GeoNear2DSphereStage::initialize(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - WorkingSetID* out) - { - if (!_densityEstimator) { - _densityEstimator.reset(new DensityEstimator(_s2Index, &_nearParams)); - } +PlanStage::StageState GeoNear2DSphereStage::initialize(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + WorkingSetID* out) { + if (!_densityEstimator) { + _densityEstimator.reset(new DensityEstimator(_s2Index, &_nearParams)); + } - double estimatedDistance; - PlanStage::StageState state = _densityEstimator->work(txn, workingSet, collection, out, - &estimatedDistance); - - if (state == IS_EOF) { - // We find a document in 4 neighbors at current level, but didn't at previous level. 
- // - // Assuming cell size at current level is d and data is even distributed, the distance - // between two nearest points are at least d. The following circle with radius of 3 * d - // covers PI * 9 * d^2, giving at most 30 documents. - // - // At the coarsest level, the search area is the whole earth. - _boundsIncrement = 3 * estimatedDistance; - invariant(_boundsIncrement > 0.0); + double estimatedDistance; + PlanStage::StageState state = + _densityEstimator->work(txn, workingSet, collection, out, &estimatedDistance); - // Clean up - _densityEstimator.reset(NULL); - } + if (state == IS_EOF) { + // We find a document in 4 neighbors at current level, but didn't at previous level. + // + // Assuming cell size at current level is d and data is even distributed, the distance + // between two nearest points are at least d. The following circle with radius of 3 * d + // covers PI * 9 * d^2, giving at most 30 documents. + // + // At the coarsest level, the search area is the whole earth. + _boundsIncrement = 3 * estimatedDistance; + invariant(_boundsIncrement > 0.0); - return state; + // Clean up + _densityEstimator.reset(NULL); } - void GeoNear2DSphereStage::finishSaveState() { - if (_densityEstimator) { - _densityEstimator->saveState(); - } + return state; +} + +void GeoNear2DSphereStage::finishSaveState() { + if (_densityEstimator) { + _densityEstimator->saveState(); } +} - void GeoNear2DSphereStage::finishRestoreState(OperationContext* txn) { - if (_densityEstimator) { - _densityEstimator->restoreState(txn); - } +void GeoNear2DSphereStage::finishRestoreState(OperationContext* txn) { + if (_densityEstimator) { + _densityEstimator->restoreState(txn); } +} - void GeoNear2DSphereStage::finishInvalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - if (_densityEstimator) { - _densityEstimator->invalidate(txn, dl, type); - } +void GeoNear2DSphereStage::finishInvalidate(OperationContext* txn, + const RecordId& dl, + InvalidationType type) { + 
if (_densityEstimator) { + _densityEstimator->invalidate(txn, dl, type); } +} - StatusWith<NearStage::CoveredInterval*> // +StatusWith<NearStage::CoveredInterval*> // GeoNear2DSphereStage::nextInterval(OperationContext* txn, WorkingSet* workingSet, Collection* collection) { + // The search is finished if we searched at least once and all the way to the edge + if (_currBounds.getInner() >= 0 && _currBounds.getOuter() == _fullBounds.getOuter()) { + return StatusWith<CoveredInterval*>(NULL); + } - // The search is finished if we searched at least once and all the way to the edge - if (_currBounds.getInner() >= 0 && _currBounds.getOuter() == _fullBounds.getOuter()) { - return StatusWith<CoveredInterval*>(NULL); - } + // + // Setup the next interval + // - // - // Setup the next interval - // + const NearStats* stats = getNearStats(); - const NearStats* stats = getNearStats(); + if (!stats->intervalStats.empty()) { + const IntervalStats& lastIntervalStats = stats->intervalStats.back(); - if (!stats->intervalStats.empty()) { + // TODO: Generally we want small numbers of results fast, then larger numbers later + if (lastIntervalStats.numResultsBuffered < 300) + _boundsIncrement *= 2; + else if (lastIntervalStats.numResultsBuffered > 600) + _boundsIncrement /= 2; + } - const IntervalStats& lastIntervalStats = stats->intervalStats.back(); + invariant(_boundsIncrement > 0.0); - // TODO: Generally we want small numbers of results fast, then larger numbers later - if (lastIntervalStats.numResultsBuffered < 300) - _boundsIncrement *= 2; - else if (lastIntervalStats.numResultsBuffered > 600) - _boundsIncrement /= 2; - } + R2Annulus nextBounds(_currBounds.center(), + _currBounds.getOuter(), + min(_currBounds.getOuter() + _boundsIncrement, _fullBounds.getOuter())); - invariant(_boundsIncrement > 0.0); + bool isLastInterval = (nextBounds.getOuter() == _fullBounds.getOuter()); + _currBounds = nextBounds; - R2Annulus nextBounds(_currBounds.center(), - _currBounds.getOuter(), - 
min(_currBounds.getOuter() + _boundsIncrement, - _fullBounds.getOuter())); + // + // Setup the covering region and stages for this interval + // - bool isLastInterval = (nextBounds.getOuter() == _fullBounds.getOuter()); - _currBounds = nextBounds; + IndexScanParams scanParams; + scanParams.descriptor = _s2Index; + scanParams.direction = 1; + // We use a filter on the key. The filter rejects keys that don't intersect with the + // annulus. An object that is in the annulus might have a key that's not in it and a key + // that's in it. As such we can't just look at one key per object. + // + // This does force us to do our own deduping of results, though. + scanParams.doNotDedup = true; + scanParams.bounds = _nearParams.baseBounds; - // - // Setup the covering region and stages for this interval - // + // Because the planner doesn't yet set up 2D index bounds, do it ourselves here + const string s2Field = _nearParams.nearQuery->field; + const int s2FieldPosition = getFieldPosition(_s2Index, s2Field); + fassert(28678, s2FieldPosition >= 0); + scanParams.bounds.fields[s2FieldPosition].intervals.clear(); + OrderedIntervalList* coveredIntervals = &scanParams.bounds.fields[s2FieldPosition]; - IndexScanParams scanParams; - scanParams.descriptor = _s2Index; - scanParams.direction = 1; - // We use a filter on the key. The filter rejects keys that don't intersect with the - // annulus. An object that is in the annulus might have a key that's not in it and a key - // that's in it. As such we can't just look at one key per object. - // - // This does force us to do our own deduping of results, though. 
- scanParams.doNotDedup = true; - scanParams.bounds = _nearParams.baseBounds; - - // Because the planner doesn't yet set up 2D index bounds, do it ourselves here - const string s2Field = _nearParams.nearQuery->field; - const int s2FieldPosition = getFieldPosition(_s2Index, s2Field); - fassert(28678, s2FieldPosition >= 0); - scanParams.bounds.fields[s2FieldPosition].intervals.clear(); - OrderedIntervalList* coveredIntervals = &scanParams.bounds.fields[s2FieldPosition]; - - TwoDSphereKeyInRegionExpression* keyMatcher = - new TwoDSphereKeyInRegionExpression(_currBounds, s2Field); - - ExpressionMapping::cover2dsphere(keyMatcher->getRegion(), - _s2Index->infoObj(), - coveredIntervals); - - // IndexScan owns the hash matcher - IndexScan* scan = new IndexScanWithMatch(txn, scanParams, workingSet, keyMatcher); - - // FetchStage owns index scan - FetchStage* fetcher(new FetchStage(txn, workingSet, scan, _nearParams.filter, collection)); - - return StatusWith<CoveredInterval*>(new CoveredInterval(fetcher, - true, - nextBounds.getInner(), - nextBounds.getOuter(), - isLastInterval)); - } + TwoDSphereKeyInRegionExpression* keyMatcher = + new TwoDSphereKeyInRegionExpression(_currBounds, s2Field); - StatusWith<double> GeoNear2DSphereStage::computeDistance(WorkingSetMember* member) { - return computeGeoNearDistance(_nearParams, member); - } + ExpressionMapping::cover2dsphere( + keyMatcher->getRegion(), _s2Index->infoObj(), coveredIntervals); + + // IndexScan owns the hash matcher + IndexScan* scan = new IndexScanWithMatch(txn, scanParams, workingSet, keyMatcher); + + // FetchStage owns index scan + FetchStage* fetcher(new FetchStage(txn, workingSet, scan, _nearParams.filter, collection)); + + return StatusWith<CoveredInterval*>(new CoveredInterval( + fetcher, true, nextBounds.getInner(), nextBounds.getOuter(), isLastInterval)); +} -} // namespace mongo +StatusWith<double> GeoNear2DSphereStage::computeDistance(WorkingSetMember* member) { + return computeGeoNearDistance(_nearParams, 
member); +} +} // namespace mongo diff --git a/src/mongo/db/exec/geo_near.h b/src/mongo/db/exec/geo_near.h index 52c28ed8303..f9295217992 100644 --- a/src/mongo/db/exec/geo_near.h +++ b/src/mongo/db/exec/geo_near.h @@ -40,137 +40,124 @@ namespace mongo { - /** - * Generic parameters for a GeoNear search - */ - struct GeoNearParams { - - GeoNearParams() : - filter(NULL), nearQuery(NULL), addPointMeta(false), addDistMeta(false) { - } - - // MatchExpression to apply to the index keys and fetched documents - // Not owned here, owned by solution nodes - MatchExpression* filter; - // Index scan bounds, not including the geo bounds - IndexBounds baseBounds; - - // Not owned here - const GeoNearExpression* nearQuery; - bool addPointMeta; - bool addDistMeta; - }; - - /** - * Implementation of GeoNear on top of a 2D index - */ - class GeoNear2DStage : public NearStage { - public: - - GeoNear2DStage(const GeoNearParams& nearParams, - OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - IndexDescriptor* twoDIndex); - - virtual ~GeoNear2DStage(); - - protected: +/** + * Generic parameters for a GeoNear search + */ +struct GeoNearParams { + GeoNearParams() : filter(NULL), nearQuery(NULL), addPointMeta(false), addDistMeta(false) {} - virtual StatusWith<CoveredInterval*> nextInterval(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection); + // MatchExpression to apply to the index keys and fetched documents + // Not owned here, owned by solution nodes + MatchExpression* filter; + // Index scan bounds, not including the geo bounds + IndexBounds baseBounds; - virtual StatusWith<double> computeDistance(WorkingSetMember* member); + // Not owned here + const GeoNearExpression* nearQuery; + bool addPointMeta; + bool addDistMeta; +}; - virtual PlanStage::StageState initialize(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - WorkingSetID* out); +/** + * Implementation of GeoNear on top of a 2D index + */ +class 
GeoNear2DStage : public NearStage { +public: + GeoNear2DStage(const GeoNearParams& nearParams, + OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + IndexDescriptor* twoDIndex); - private: + virtual ~GeoNear2DStage(); - virtual void finishSaveState(); +protected: + virtual StatusWith<CoveredInterval*> nextInterval(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection); - virtual void finishRestoreState(OperationContext* txn); + virtual StatusWith<double> computeDistance(WorkingSetMember* member); - virtual void finishInvalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type); + virtual PlanStage::StageState initialize(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + WorkingSetID* out); - const GeoNearParams _nearParams; +private: + virtual void finishSaveState(); - // The 2D index we're searching over - // Not owned here - IndexDescriptor* const _twoDIndex; + virtual void finishRestoreState(OperationContext* txn); - // The total search annulus - const R2Annulus _fullBounds; + virtual void finishInvalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - // The current search annulus - R2Annulus _currBounds; + const GeoNearParams _nearParams; - // Amount to increment the next bounds by - double _boundsIncrement; + // The 2D index we're searching over + // Not owned here + IndexDescriptor* const _twoDIndex; - class DensityEstimator; - std::unique_ptr<DensityEstimator> _densityEstimator; - }; + // The total search annulus + const R2Annulus _fullBounds; - /** - * Implementation of GeoNear on top of a 2DSphere (S2) index - */ - class GeoNear2DSphereStage : public NearStage { - public: + // The current search annulus + R2Annulus _currBounds; - GeoNear2DSphereStage(const GeoNearParams& nearParams, - OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - IndexDescriptor* s2Index); + // Amount to increment the next bounds by + double 
_boundsIncrement; - virtual ~GeoNear2DSphereStage(); + class DensityEstimator; + std::unique_ptr<DensityEstimator> _densityEstimator; +}; - protected: +/** + * Implementation of GeoNear on top of a 2DSphere (S2) index + */ +class GeoNear2DSphereStage : public NearStage { +public: + GeoNear2DSphereStage(const GeoNearParams& nearParams, + OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + IndexDescriptor* s2Index); - virtual StatusWith<CoveredInterval*> nextInterval(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection); + virtual ~GeoNear2DSphereStage(); - virtual StatusWith<double> computeDistance(WorkingSetMember* member); +protected: + virtual StatusWith<CoveredInterval*> nextInterval(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection); - virtual PlanStage::StageState initialize(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - WorkingSetID* out); + virtual StatusWith<double> computeDistance(WorkingSetMember* member); - private: + virtual PlanStage::StageState initialize(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + WorkingSetID* out); - virtual void finishSaveState(); +private: + virtual void finishSaveState(); - virtual void finishRestoreState(OperationContext* txn); + virtual void finishRestoreState(OperationContext* txn); - virtual void finishInvalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type); + virtual void finishInvalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - const GeoNearParams _nearParams; + const GeoNearParams _nearParams; - // The 2D index we're searching over - // Not owned here - IndexDescriptor* const _s2Index; + // The 2D index we're searching over + // Not owned here + IndexDescriptor* const _s2Index; - // The total search annulus - const R2Annulus _fullBounds; + // The total search annulus + const R2Annulus _fullBounds; - // The current search annulus - 
R2Annulus _currBounds; + // The current search annulus + R2Annulus _currBounds; - // Amount to increment the next bounds by - double _boundsIncrement; + // Amount to increment the next bounds by + double _boundsIncrement; - class DensityEstimator; - std::unique_ptr<DensityEstimator> _densityEstimator; - }; + class DensityEstimator; + std::unique_ptr<DensityEstimator> _densityEstimator; +}; -} // namespace mongo +} // namespace mongo diff --git a/src/mongo/db/exec/group.cpp b/src/mongo/db/exec/group.cpp index 68b21928b62..433b4802cdf 100644 --- a/src/mongo/db/exec/group.cpp +++ b/src/mongo/db/exec/group.cpp @@ -38,266 +38,261 @@ namespace mongo { - using std::unique_ptr; - using std::vector; - - namespace { - - // Helper function that extracts the group key from a BSONObj. - Status getKey(const BSONObj& obj, - const BSONObj& keyPattern, - ScriptingFunction func, - Scope* s, - BSONObj* key) { - if (func) { - BSONObjBuilder b(obj.objsize() + 32); - b.append("0", obj); - const BSONObj& k = b.obj(); - int res = s->invoke(func, &k, 0); - if (res != 0) { - return Status(ErrorCodes::BadValue, - str::stream() << "invoke failed in $keyf: " << s->getError()); - } - int type = s->type("__returnValue"); - if (type != Object) { - return Status(ErrorCodes::BadValue, "return of $key has to be an object"); - } - *key = s->getObject("__returnValue"); - return Status::OK(); - } - *key = obj.extractFields(keyPattern, true).getOwned(); - return Status::OK(); +using std::unique_ptr; +using std::vector; + +namespace { + +// Helper function that extracts the group key from a BSONObj. 
+Status getKey( + const BSONObj& obj, const BSONObj& keyPattern, ScriptingFunction func, Scope* s, BSONObj* key) { + if (func) { + BSONObjBuilder b(obj.objsize() + 32); + b.append("0", obj); + const BSONObj& k = b.obj(); + int res = s->invoke(func, &k, 0); + if (res != 0) { + return Status(ErrorCodes::BadValue, + str::stream() << "invoke failed in $keyf: " << s->getError()); } - - } // namespace - - // static - const char* GroupStage::kStageType = "GROUP"; - - GroupStage::GroupStage(OperationContext* txn, - const GroupRequest& request, - WorkingSet* workingSet, - PlanStage* child) - : _txn(txn), - _request(request), - _ws(workingSet), - _commonStats(kStageType), - _specificStats(), - _child(child), - _groupState(GroupState_Initializing), - _reduceFunction(0), - _keyFunction(0) {} - - void GroupStage::initGroupScripting() { - // Initialize _scope. - const std::string userToken = - AuthorizationSession::get(ClientBasic::getCurrent()) - ->getAuthenticatedUserNamesToken(); - - const NamespaceString nss(_request.ns); - _scope = globalScriptEngine->getPooledScope(_txn, nss.db().toString(), "group" + userToken); - if (!_request.reduceScope.isEmpty()) { - _scope->init(&_request.reduceScope); - } - _scope->setObject("$initial", _request.initial, true); - _scope->exec("$reduce = " + _request.reduceCode, "$group reduce setup", false, true, true, - 2 * 1000); - _scope->exec("$arr = [];", "$group reduce setup 2", false, true, true, 2 * 1000); - - // Initialize _reduceFunction. - _reduceFunction = _scope->createFunction("function(){ " - " if ( $arr[n] == null ){ " - " next = {}; " - " Object.extend( next , $key ); " - " Object.extend( next , $initial , true ); " - " $arr[n] = next; " - " next = null; " - " } " - " $reduce( obj , $arr[n] ); " - "}"); - - // Initialize _keyFunction, if a key function was provided. 
- if (_request.keyFunctionCode.size()) { - _keyFunction = _scope->createFunction(_request.keyFunctionCode.c_str()); + int type = s->type("__returnValue"); + if (type != Object) { + return Status(ErrorCodes::BadValue, "return of $key has to be an object"); } + *key = s->getObject("__returnValue"); + return Status::OK(); } + *key = obj.extractFields(keyPattern, true).getOwned(); + return Status::OK(); +} + +} // namespace + +// static +const char* GroupStage::kStageType = "GROUP"; + +GroupStage::GroupStage(OperationContext* txn, + const GroupRequest& request, + WorkingSet* workingSet, + PlanStage* child) + : _txn(txn), + _request(request), + _ws(workingSet), + _commonStats(kStageType), + _specificStats(), + _child(child), + _groupState(GroupState_Initializing), + _reduceFunction(0), + _keyFunction(0) {} + +void GroupStage::initGroupScripting() { + // Initialize _scope. + const std::string userToken = + AuthorizationSession::get(ClientBasic::getCurrent())->getAuthenticatedUserNamesToken(); + + const NamespaceString nss(_request.ns); + _scope = globalScriptEngine->getPooledScope(_txn, nss.db().toString(), "group" + userToken); + if (!_request.reduceScope.isEmpty()) { + _scope->init(&_request.reduceScope); + } + _scope->setObject("$initial", _request.initial, true); + _scope->exec( + "$reduce = " + _request.reduceCode, "$group reduce setup", false, true, true, 2 * 1000); + _scope->exec("$arr = [];", "$group reduce setup 2", false, true, true, 2 * 1000); + + // Initialize _reduceFunction. + _reduceFunction = _scope->createFunction( + "function(){ " + " if ( $arr[n] == null ){ " + " next = {}; " + " Object.extend( next , $key ); " + " Object.extend( next , $initial , true ); " + " $arr[n] = next; " + " next = null; " + " } " + " $reduce( obj , $arr[n] ); " + "}"); + + // Initialize _keyFunction, if a key function was provided. 
+ if (_request.keyFunctionCode.size()) { + _keyFunction = _scope->createFunction(_request.keyFunctionCode.c_str()); + } +} - Status GroupStage::processObject(const BSONObj& obj) { - BSONObj key; - Status getKeyStatus = getKey(obj, _request.keyPattern, _keyFunction, _scope.get(), - &key); - if (!getKeyStatus.isOK()) { - return getKeyStatus; - } - - int& n = _groupMap[key]; - if (n == 0) { - n = _groupMap.size(); - _scope->setObject("$key", key, true); - if (n > 20000) { - return Status(ErrorCodes::BadValue, - "group() can't handle more than 20000 unique keys"); - } - } - - _scope->setObject("obj", obj, true); - _scope->setNumber("n", n - 1); - if (_scope->invoke(_reduceFunction, 0, 0, 0, true)) { - return Status(ErrorCodes::BadValue, - str::stream() << "reduce invoke failed: " << _scope->getError()); - } - - return Status::OK(); +Status GroupStage::processObject(const BSONObj& obj) { + BSONObj key; + Status getKeyStatus = getKey(obj, _request.keyPattern, _keyFunction, _scope.get(), &key); + if (!getKeyStatus.isOK()) { + return getKeyStatus; } - BSONObj GroupStage::finalizeResults() { - if (!_request.finalize.empty()) { - _scope->exec("$finalize = " + _request.finalize, "$group finalize define", false, - true, true, 2 * 1000); - ScriptingFunction finalizeFunction = - _scope->createFunction("function(){ " - " for(var i=0; i < $arr.length; i++){ " - " var ret = $finalize($arr[i]); " - " if (ret !== undefined) " - " $arr[i] = ret; " - " } " - "}"); - _scope->invoke(finalizeFunction, 0, 0, 0, true); + int& n = _groupMap[key]; + if (n == 0) { + n = _groupMap.size(); + _scope->setObject("$key", key, true); + if (n > 20000) { + return Status(ErrorCodes::BadValue, "group() can't handle more than 20000 unique keys"); } - - _specificStats.nGroups = _groupMap.size(); - - BSONObj results = _scope->getObject("$arr").getOwned(); - - _scope->exec("$arr = [];", "$group reduce setup 2", false, true, true, 2 * 1000); - _scope->gc(); - - return results; } - PlanStage::StageState 
GroupStage::work(WorkingSetID* out) { - ++_commonStats.works; + _scope->setObject("obj", obj, true); + _scope->setNumber("n", n - 1); + if (_scope->invoke(_reduceFunction, 0, 0, 0, true)) { + return Status(ErrorCodes::BadValue, + str::stream() << "reduce invoke failed: " << _scope->getError()); + } - ScopedTimer timer(&_commonStats.executionTimeMillis); + return Status::OK(); +} - if (isEOF()) { return PlanStage::IS_EOF; } +BSONObj GroupStage::finalizeResults() { + if (!_request.finalize.empty()) { + _scope->exec("$finalize = " + _request.finalize, + "$group finalize define", + false, + true, + true, + 2 * 1000); + ScriptingFunction finalizeFunction = _scope->createFunction( + "function(){ " + " for(var i=0; i < $arr.length; i++){ " + " var ret = $finalize($arr[i]); " + " if (ret !== undefined) " + " $arr[i] = ret; " + " } " + "}"); + _scope->invoke(finalizeFunction, 0, 0, 0, true); + } - // On the first call to work(), call initGroupScripting(). - if (_groupState == GroupState_Initializing) { - initGroupScripting(); - _groupState = GroupState_ReadingFromChild; - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } + _specificStats.nGroups = _groupMap.size(); - // Otherwise, read from our child. - invariant(_groupState == GroupState_ReadingFromChild); - WorkingSetID id = WorkingSet::INVALID_ID; - StageState state = _child->work(&id); + BSONObj results = _scope->getObject("$arr").getOwned(); - if (PlanStage::NEED_TIME == state) { - ++_commonStats.needTime; - return state; - } - else if (PlanStage::NEED_YIELD == state) { - ++_commonStats.needYield; - *out = id; - return state; - } - else if (PlanStage::FAILURE == state) { - *out = id; - // If a stage fails, it may create a status WSM to indicate why it failed, in which - // case 'id' is valid. If ID is invalid, we create our own error message. 
- if (WorkingSet::INVALID_ID == id) { - const std::string errmsg = "group stage failed to read in results from child"; - *out = WorkingSetCommon::allocateStatusMember(_ws, - Status(ErrorCodes::InternalError, - errmsg)); - } - return state; - } - else if (PlanStage::DEAD == state) { - return state; - } - else if (PlanStage::ADVANCED == state) { - WorkingSetMember* member = _ws->get(id); - // Group queries can't have projections. This means that covering analysis will always - // add a fetch. We should always get fetched data, and never just key data. - invariant(member->hasObj()); - - Status status = processObject(member->obj.value()); - if (!status.isOK()) { - *out = WorkingSetCommon::allocateStatusMember(_ws, status); - return PlanStage::FAILURE; - } - - _ws->free(id); - - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - else { - // We're done reading from our child. - invariant(PlanStage::IS_EOF == state); + _scope->exec("$arr = [];", "$group reduce setup 2", false, true, true, 2 * 1000); + _scope->gc(); - // Transition to state "done." Future calls to work() will return IS_EOF. - _groupState = GroupState_Done; + return results; +} - BSONObj results = finalizeResults(); +PlanStage::StageState GroupStage::work(WorkingSetID* out) { + ++_commonStats.works; - *out = _ws->allocate(); - WorkingSetMember* member = _ws->get(*out); - member->obj = Snapshotted<BSONObj>(SnapshotId(), results); - member->state = WorkingSetMember::OWNED_OBJ; + ScopedTimer timer(&_commonStats.executionTimeMillis); - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } + if (isEOF()) { + return PlanStage::IS_EOF; } - bool GroupStage::isEOF() { - return _groupState == GroupState_Done; + // On the first call to work(), call initGroupScripting(). 
+ if (_groupState == GroupState_Initializing) { + initGroupScripting(); + _groupState = GroupState_ReadingFromChild; + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } - void GroupStage::saveState() { - _txn = NULL; - ++_commonStats.yields; - _child->saveState(); - } + // Otherwise, read from our child. + invariant(_groupState == GroupState_ReadingFromChild); + WorkingSetID id = WorkingSet::INVALID_ID; + StageState state = _child->work(&id); + + if (PlanStage::NEED_TIME == state) { + ++_commonStats.needTime; + return state; + } else if (PlanStage::NEED_YIELD == state) { + ++_commonStats.needYield; + *out = id; + return state; + } else if (PlanStage::FAILURE == state) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it failed, in which + // case 'id' is valid. If ID is invalid, we create our own error message. + if (WorkingSet::INVALID_ID == id) { + const std::string errmsg = "group stage failed to read in results from child"; + *out = WorkingSetCommon::allocateStatusMember( + _ws, Status(ErrorCodes::InternalError, errmsg)); + } + return state; + } else if (PlanStage::DEAD == state) { + return state; + } else if (PlanStage::ADVANCED == state) { + WorkingSetMember* member = _ws->get(id); + // Group queries can't have projections. This means that covering analysis will always + // add a fetch. We should always get fetched data, and never just key data. 
+ invariant(member->hasObj()); + + Status status = processObject(member->obj.value()); + if (!status.isOK()) { + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + return PlanStage::FAILURE; + } - void GroupStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; - _child->restoreState(opCtx); - } + _ws->free(id); - void GroupStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - _child->invalidate(txn, dl, type); - } + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else { + // We're done reading from our child. + invariant(PlanStage::IS_EOF == state); - vector<PlanStage*> GroupStage::getChildren() const { - vector<PlanStage*> children; - children.push_back(_child.get()); - return children; - } + // Transition to state "done." Future calls to work() will return IS_EOF. + _groupState = GroupState_Done; - PlanStageStats* GroupStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_GROUP)); - GroupStats* groupStats = new GroupStats(_specificStats); - ret->specific.reset(groupStats); - ret->children.push_back(_child->getStats()); - return ret.release(); - } + BSONObj results = finalizeResults(); - const CommonStats* GroupStage::getCommonStats() const { - return &_commonStats; - } + *out = _ws->allocate(); + WorkingSetMember* member = _ws->get(*out); + member->obj = Snapshotted<BSONObj>(SnapshotId(), results); + member->state = WorkingSetMember::OWNED_OBJ; - const SpecificStats* GroupStage::getSpecificStats() const { - return &_specificStats; + ++_commonStats.advanced; + return PlanStage::ADVANCED; } +} + +bool GroupStage::isEOF() { + return _groupState == GroupState_Done; +} + +void GroupStage::saveState() { + _txn = NULL; + ++_commonStats.yields; + _child->saveState(); +} + +void GroupStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); 
+ _txn = opCtx; + ++_commonStats.unyields; + _child->restoreState(opCtx); +} + +void GroupStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + _child->invalidate(txn, dl, type); +} + +vector<PlanStage*> GroupStage::getChildren() const { + vector<PlanStage*> children; + children.push_back(_child.get()); + return children; +} + +PlanStageStats* GroupStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_GROUP)); + GroupStats* groupStats = new GroupStats(_specificStats); + ret->specific.reset(groupStats); + ret->children.push_back(_child->getStats()); + return ret.release(); +} + +const CommonStats* GroupStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* GroupStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/group.h b/src/mongo/db/exec/group.h index b9f525b95bb..49f5c881d84 100644 --- a/src/mongo/db/exec/group.h +++ b/src/mongo/db/exec/group.h @@ -34,135 +34,138 @@ namespace mongo { - class Collection; +class Collection; - /** - * A description of a request for a group operation. Copyable. - */ - struct GroupRequest { - // Namespace to operate on (e.g. "foo.bar"). - std::string ns; +/** + * A description of a request for a group operation. Copyable. + */ +struct GroupRequest { + // Namespace to operate on (e.g. "foo.bar"). + std::string ns; - // A predicate describing the set of documents to group. - BSONObj query; + // A predicate describing the set of documents to group. + BSONObj query; - // The field(s) to group by. Alternative to "keyFunctionCode". Empty if "keyFunctionCode" - // is being used instead. - BSONObj keyPattern; + // The field(s) to group by. Alternative to "keyFunctionCode". Empty if "keyFunctionCode" + // is being used instead. + BSONObj keyPattern; - // A Javascript function that maps a document to a key object. 
Alternative to "keyPattern". - // Empty is "keyPattern" is being used instead. - std::string keyFunctionCode; + // A Javascript function that maps a document to a key object. Alternative to "keyPattern". + // Empty is "keyPattern" is being used instead. + std::string keyFunctionCode; - // A Javascript function that takes a (input document, group result) pair and - // updates the group result document. - std::string reduceCode; + // A Javascript function that takes a (input document, group result) pair and + // updates the group result document. + std::string reduceCode; - // Scope for the reduce function. Optional. - BSONObj reduceScope; + // Scope for the reduce function. Optional. + BSONObj reduceScope; - // The initial value for the group result. - BSONObj initial; + // The initial value for the group result. + BSONObj initial; - // A Javascript function that "finalizes" a group result. Optional. - std::string finalize; + // A Javascript function that "finalizes" a group result. Optional. + std::string finalize; - // Whether this is an explain of a group. - bool explain; - }; + // Whether this is an explain of a group. + bool explain; +}; - /** - * Stage used by the group command. Consumes input documents from its child stage (returning - * NEED_TIME once for each document produced by the child), returns ADVANCED exactly once with - * the entire group result, then returns EOF. - * - * Only created through the getExecutorGroup path. - */ - class GroupStage: public PlanStage { - MONGO_DISALLOW_COPYING(GroupStage); - public: - GroupStage(OperationContext* txn, - const GroupRequest& request, - WorkingSet* workingSet, - PlanStage* child); - virtual ~GroupStage() { } +/** + * Stage used by the group command. Consumes input documents from its child stage (returning + * NEED_TIME once for each document produced by the child), returns ADVANCED exactly once with + * the entire group result, then returns EOF. + * + * Only created through the getExecutorGroup path. 
+ */ +class GroupStage : public PlanStage { + MONGO_DISALLOW_COPYING(GroupStage); - virtual StageState work(WorkingSetID* out); - virtual bool isEOF(); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); +public: + GroupStage(OperationContext* txn, + const GroupRequest& request, + WorkingSet* workingSet, + PlanStage* child); + virtual ~GroupStage() {} - virtual std::vector<PlanStage*> getChildren() const; + virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual StageType stageType() const { return STAGE_GROUP; } + virtual std::vector<PlanStage*> getChildren() const; - virtual PlanStageStats* getStats(); + virtual StageType stageType() const { + return STAGE_GROUP; + } - virtual const CommonStats* getCommonStats() const; + virtual PlanStageStats* getStats(); - virtual const SpecificStats* getSpecificStats() const; + virtual const CommonStats* getCommonStats() const; - static const char* kStageType; + virtual const SpecificStats* getSpecificStats() const; - private: - /** - * Keeps track of what this group is currently doing so that it can do the right thing on - * the next call to work(). - */ - enum GroupState { - // Need to initialize the underlying Javascript machinery. - GroupState_Initializing, + static const char* kStageType; - // Retrieving the next document from the child stage and processing it. - GroupState_ReadingFromChild, +private: + /** + * Keeps track of what this group is currently doing so that it can do the right thing on + * the next call to work(). + */ + enum GroupState { + // Need to initialize the underlying Javascript machinery. + GroupState_Initializing, - // Results have been returned. 
- GroupState_Done - }; + // Retrieving the next document from the child stage and processing it. + GroupState_ReadingFromChild, - // Initializes _scope, _reduceFunction and _keyFunction using the global scripting engine. - void initGroupScripting(); + // Results have been returned. + GroupState_Done + }; - // Updates _groupMap and _scope to account for the group key associated with this object. - // Returns an error status if an error occurred, else Status::OK(). - Status processObject(const BSONObj& obj); + // Initializes _scope, _reduceFunction and _keyFunction using the global scripting engine. + void initGroupScripting(); - // Finalize the results for this group operation. Returns an owned BSONObj with the results - // array. - BSONObj finalizeResults(); + // Updates _groupMap and _scope to account for the group key associated with this object. + // Returns an error status if an error occurred, else Status::OK(). + Status processObject(const BSONObj& obj); - // Transactional context for read locks. Not owned by us. - OperationContext* _txn; + // Finalize the results for this group operation. Returns an owned BSONObj with the results + // array. + BSONObj finalizeResults(); - GroupRequest _request; + // Transactional context for read locks. Not owned by us. + OperationContext* _txn; - // The WorkingSet we annotate with results. Not owned by us. - WorkingSet* _ws; + GroupRequest _request; - CommonStats _commonStats; - GroupStats _specificStats; + // The WorkingSet we annotate with results. Not owned by us. + WorkingSet* _ws; - std::unique_ptr<PlanStage> _child; + CommonStats _commonStats; + GroupStats _specificStats; - // Current state for this stage. - GroupState _groupState; + std::unique_ptr<PlanStage> _child; - // The Scope object that all script operations for this group stage will use. Initialized - // by initGroupScripting(). Owned here. - std::unique_ptr<Scope> _scope; + // Current state for this stage. 
+ GroupState _groupState; - // The reduce function for the group operation. Initialized by initGroupScripting(). Owned - // by _scope. - ScriptingFunction _reduceFunction; + // The Scope object that all script operations for this group stage will use. Initialized + // by initGroupScripting(). Owned here. + std::unique_ptr<Scope> _scope; - // The key function for the group operation if one was provided by the user, else 0. - // Initialized by initGroupScripting(). Owned by _scope. - ScriptingFunction _keyFunction; + // The reduce function for the group operation. Initialized by initGroupScripting(). Owned + // by _scope. + ScriptingFunction _reduceFunction; - // Map from group key => group index. The group index is used to index into "$arr", a - // variable owned by _scope which contains the group data for this key. - std::map<BSONObj, int, BSONObjCmp> _groupMap; - }; + // The key function for the group operation if one was provided by the user, else 0. + // Initialized by initGroupScripting(). Owned by _scope. + ScriptingFunction _keyFunction; + + // Map from group key => group index. The group index is used to index into "$arr", a + // variable owned by _scope which contains the group data for this key. 
+ std::map<BSONObj, int, BSONObjCmp> _groupMap; +}; } // namespace mongo diff --git a/src/mongo/db/exec/idhack.cpp b/src/mongo/db/exec/idhack.cpp index 73a950e4dc2..dd5c36622e1 100644 --- a/src/mongo/db/exec/idhack.cpp +++ b/src/mongo/db/exec/idhack.cpp @@ -42,215 +42,221 @@ namespace mongo { - using std::unique_ptr; - using std::vector; - - // static - const char* IDHackStage::kStageType = "IDHACK"; - - IDHackStage::IDHackStage(OperationContext* txn, const Collection* collection, - CanonicalQuery* query, WorkingSet* ws) - : _txn(txn), - _collection(collection), - _workingSet(ws), - _key(query->getQueryObj()["_id"].wrap()), - _done(false), - _idBeingPagedIn(WorkingSet::INVALID_ID), - _commonStats(kStageType) { - if (NULL != query->getProj()) { - _addKeyMetadata = query->getProj()->wantIndexKey(); - } - else { - _addKeyMetadata = false; - } +using std::unique_ptr; +using std::vector; + +// static +const char* IDHackStage::kStageType = "IDHACK"; + +IDHackStage::IDHackStage(OperationContext* txn, + const Collection* collection, + CanonicalQuery* query, + WorkingSet* ws) + : _txn(txn), + _collection(collection), + _workingSet(ws), + _key(query->getQueryObj()["_id"].wrap()), + _done(false), + _idBeingPagedIn(WorkingSet::INVALID_ID), + _commonStats(kStageType) { + if (NULL != query->getProj()) { + _addKeyMetadata = query->getProj()->wantIndexKey(); + } else { + _addKeyMetadata = false; } - - IDHackStage::IDHackStage(OperationContext* txn, Collection* collection, - const BSONObj& key, WorkingSet* ws) - : _txn(txn), - _collection(collection), - _workingSet(ws), - _key(key), - _done(false), - _addKeyMetadata(false), - _idBeingPagedIn(WorkingSet::INVALID_ID), - _commonStats(kStageType) { } - - IDHackStage::~IDHackStage() { } - - bool IDHackStage::isEOF() { - if (WorkingSet::INVALID_ID != _idBeingPagedIn) { - // We asked the parent for a page-in, but still haven't had a chance to return the - // paged in document - return false; - } - - return _done; +} + 
+IDHackStage::IDHackStage(OperationContext* txn, + Collection* collection, + const BSONObj& key, + WorkingSet* ws) + : _txn(txn), + _collection(collection), + _workingSet(ws), + _key(key), + _done(false), + _addKeyMetadata(false), + _idBeingPagedIn(WorkingSet::INVALID_ID), + _commonStats(kStageType) {} + +IDHackStage::~IDHackStage() {} + +bool IDHackStage::isEOF() { + if (WorkingSet::INVALID_ID != _idBeingPagedIn) { + // We asked the parent for a page-in, but still haven't had a chance to return the + // paged in document + return false; } - PlanStage::StageState IDHackStage::work(WorkingSetID* out) { - ++_commonStats.works; - - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); + return _done; +} - if (_done) { return PlanStage::IS_EOF; } +PlanStage::StageState IDHackStage::work(WorkingSetID* out) { + ++_commonStats.works; - if (WorkingSet::INVALID_ID != _idBeingPagedIn) { - invariant(_recordCursor); - WorkingSetID id = _idBeingPagedIn; - _idBeingPagedIn = WorkingSet::INVALID_ID; - WorkingSetMember* member = _workingSet->get(id); + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - invariant(WorkingSetCommon::fetchIfUnfetched(_txn, member, _recordCursor)); + if (_done) { + return PlanStage::IS_EOF; + } - return advance(id, member, out); - } + if (WorkingSet::INVALID_ID != _idBeingPagedIn) { + invariant(_recordCursor); + WorkingSetID id = _idBeingPagedIn; + _idBeingPagedIn = WorkingSet::INVALID_ID; + WorkingSetMember* member = _workingSet->get(id); - WorkingSetID id = WorkingSet::INVALID_ID; - try { - // Use the index catalog to get the id index. - const IndexCatalog* catalog = _collection->getIndexCatalog(); - - // Find the index we use. - IndexDescriptor* idDesc = catalog->findIdIndex(_txn); - if (NULL == idDesc) { - _done = true; - return PlanStage::IS_EOF; - } - - // Look up the key by going directly to the index. 
- RecordId loc = catalog->getIndex(idDesc)->findSingle(_txn, _key); - - // Key not found. - if (loc.isNull()) { - _done = true; - return PlanStage::IS_EOF; - } - - ++_specificStats.keysExamined; - ++_specificStats.docsExamined; - - // Create a new WSM for the result document. - id = _workingSet->allocate(); - WorkingSetMember* member = _workingSet->get(id); - member->state = WorkingSetMember::LOC_AND_IDX; - member->loc = loc; - - if (!_recordCursor) _recordCursor = _collection->getCursor(_txn); - - // We may need to request a yield while we fetch the document. - if (auto fetcher = _recordCursor->fetcherForId(loc)) { - // There's something to fetch. Hand the fetcher off to the WSM, and pass up a - // fetch request. - _idBeingPagedIn = id; - member->setFetcher(fetcher.release()); - *out = id; - _commonStats.needYield++; - return NEED_YIELD; - } - - // The doc was already in memory, so we go ahead and return it. - if (!WorkingSetCommon::fetch(_txn, member, _recordCursor)) { - // _id is immutable so the index would return the only record that could - // possibly match the query. - _workingSet->free(id); - _commonStats.isEOF = true; - _done = true; - return IS_EOF; - } - - return advance(id, member, out); - } - catch (const WriteConflictException& wce) { - // Restart at the beginning on retry. - _recordCursor.reset(); - if (id != WorkingSet::INVALID_ID) - _workingSet->free(id); + invariant(WorkingSetCommon::fetchIfUnfetched(_txn, member, _recordCursor)); - *out = WorkingSet::INVALID_ID; - _commonStats.needYield++; - return NEED_YIELD; - } + return advance(id, member, out); } - PlanStage::StageState IDHackStage::advance(WorkingSetID id, - WorkingSetMember* member, - WorkingSetID* out) { - invariant(member->hasObj()); + WorkingSetID id = WorkingSet::INVALID_ID; + try { + // Use the index catalog to get the id index. 
+ const IndexCatalog* catalog = _collection->getIndexCatalog(); - if (_addKeyMetadata) { - BSONObjBuilder bob; - BSONObj ownedKeyObj = member->obj.value()["_id"].wrap().getOwned(); - bob.appendKeys(_key, ownedKeyObj); - member->addComputed(new IndexKeyComputedData(bob.obj())); + // Find the index we use. + IndexDescriptor* idDesc = catalog->findIdIndex(_txn); + if (NULL == idDesc) { + _done = true; + return PlanStage::IS_EOF; } - _done = true; - ++_commonStats.advanced; - *out = id; - return PlanStage::ADVANCED; - } - - void IDHackStage::saveState() { - _txn = NULL; - ++_commonStats.yields; - if (_recordCursor) _recordCursor->saveUnpositioned(); - } + // Look up the key by going directly to the index. + RecordId loc = catalog->getIndex(idDesc)->findSingle(_txn, _key); - void IDHackStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; - if (_recordCursor) _recordCursor->restore(opCtx); - } - - void IDHackStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; + // Key not found. + if (loc.isNull()) { + _done = true; + return PlanStage::IS_EOF; + } - // Since updates can't mutate the '_id' field, we can ignore mutation invalidations. - if (INVALIDATION_MUTATION == type) { - return; + ++_specificStats.keysExamined; + ++_specificStats.docsExamined; + + // Create a new WSM for the result document. + id = _workingSet->allocate(); + WorkingSetMember* member = _workingSet->get(id); + member->state = WorkingSetMember::LOC_AND_IDX; + member->loc = loc; + + if (!_recordCursor) + _recordCursor = _collection->getCursor(_txn); + + // We may need to request a yield while we fetch the document. + if (auto fetcher = _recordCursor->fetcherForId(loc)) { + // There's something to fetch. Hand the fetcher off to the WSM, and pass up a + // fetch request. 
+ _idBeingPagedIn = id; + member->setFetcher(fetcher.release()); + *out = id; + _commonStats.needYield++; + return NEED_YIELD; } - // It's possible that the loc getting invalidated is the one we're about to - // fetch. In this case we do a "forced fetch" and put the WSM in owned object state. - if (WorkingSet::INVALID_ID != _idBeingPagedIn) { - WorkingSetMember* member = _workingSet->get(_idBeingPagedIn); - if (member->hasLoc() && (member->loc == dl)) { - // Fetch it now and kill the diskloc. - WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); - } + // The doc was already in memory, so we go ahead and return it. + if (!WorkingSetCommon::fetch(_txn, member, _recordCursor)) { + // _id is immutable so the index would return the only record that could + // possibly match the query. + _workingSet->free(id); + _commonStats.isEOF = true; + _done = true; + return IS_EOF; } - } - // static - bool IDHackStage::supportsQuery(const CanonicalQuery& query) { - return !query.getParsed().showRecordId() - && query.getParsed().getHint().isEmpty() - && 0 == query.getParsed().getSkip() - && CanonicalQuery::isSimpleIdQuery(query.getParsed().getFilter()) - && !query.getParsed().isTailable(); - } + return advance(id, member, out); + } catch (const WriteConflictException& wce) { + // Restart at the beginning on retry. 
+ _recordCursor.reset(); + if (id != WorkingSet::INVALID_ID) + _workingSet->free(id); - vector<PlanStage*> IDHackStage::getChildren() const { - vector<PlanStage*> empty; - return empty; + *out = WorkingSet::INVALID_ID; + _commonStats.needYield++; + return NEED_YIELD; } - - PlanStageStats* IDHackStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_IDHACK)); - ret->specific.reset(new IDHackStats(_specificStats)); - return ret.release(); +} + +PlanStage::StageState IDHackStage::advance(WorkingSetID id, + WorkingSetMember* member, + WorkingSetID* out) { + invariant(member->hasObj()); + + if (_addKeyMetadata) { + BSONObjBuilder bob; + BSONObj ownedKeyObj = member->obj.value()["_id"].wrap().getOwned(); + bob.appendKeys(_key, ownedKeyObj); + member->addComputed(new IndexKeyComputedData(bob.obj())); } - const CommonStats* IDHackStage::getCommonStats() const { - return &_commonStats; + _done = true; + ++_commonStats.advanced; + *out = id; + return PlanStage::ADVANCED; +} + +void IDHackStage::saveState() { + _txn = NULL; + ++_commonStats.yields; + if (_recordCursor) + _recordCursor->saveUnpositioned(); +} + +void IDHackStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; + if (_recordCursor) + _recordCursor->restore(opCtx); +} + +void IDHackStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + + // Since updates can't mutate the '_id' field, we can ignore mutation invalidations. + if (INVALIDATION_MUTATION == type) { + return; } - const SpecificStats* IDHackStage::getSpecificStats() const { - return &_specificStats; + // It's possible that the loc getting invalidated is the one we're about to + // fetch. In this case we do a "forced fetch" and put the WSM in owned object state. 
+ if (WorkingSet::INVALID_ID != _idBeingPagedIn) { + WorkingSetMember* member = _workingSet->get(_idBeingPagedIn); + if (member->hasLoc() && (member->loc == dl)) { + // Fetch it now and kill the diskloc. + WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); + } } +} + +// static +bool IDHackStage::supportsQuery(const CanonicalQuery& query) { + return !query.getParsed().showRecordId() && query.getParsed().getHint().isEmpty() && + 0 == query.getParsed().getSkip() && + CanonicalQuery::isSimpleIdQuery(query.getParsed().getFilter()) && + !query.getParsed().isTailable(); +} + +vector<PlanStage*> IDHackStage::getChildren() const { + vector<PlanStage*> empty; + return empty; +} + +PlanStageStats* IDHackStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_IDHACK)); + ret->specific.reset(new IDHackStats(_specificStats)); + return ret.release(); +} + +const CommonStats* IDHackStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* IDHackStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/idhack.h b/src/mongo/db/exec/idhack.h index 5430addd12b..b4dc87c1b84 100644 --- a/src/mongo/db/exec/idhack.h +++ b/src/mongo/db/exec/idhack.h @@ -37,83 +37,86 @@ namespace mongo { - class RecordCursor; +class RecordCursor; - /** - * A standalone stage implementing the fast path for key-value retrievals - * via the _id index. - */ - class IDHackStage : public PlanStage { - public: - /** Takes ownership of all the arguments -collection. */ - IDHackStage(OperationContext* txn, const Collection* collection, - CanonicalQuery* query, WorkingSet* ws); +/** + * A standalone stage implementing the fast path for key-value retrievals + * via the _id index. + */ +class IDHackStage : public PlanStage { +public: + /** Takes ownership of all the arguments -collection. 
*/ + IDHackStage(OperationContext* txn, + const Collection* collection, + CanonicalQuery* query, + WorkingSet* ws); - IDHackStage(OperationContext* txn, Collection* collection, - const BSONObj& key, WorkingSet* ws); + IDHackStage(OperationContext* txn, Collection* collection, const BSONObj& key, WorkingSet* ws); - virtual ~IDHackStage(); + virtual ~IDHackStage(); - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - /** - * ID Hack has a very strict criteria for the queries it supports. - */ - static bool supportsQuery(const CanonicalQuery& query); + /** + * ID Hack has a very strict criteria for the queries it supports. + */ + static bool supportsQuery(const CanonicalQuery& query); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_IDHACK; } + virtual StageType stageType() const { + return STAGE_IDHACK; + } - PlanStageStats* getStats(); + PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - /** - * Marks this stage as done, optionally adds key metadata, and returns PlanStage::ADVANCED. - * - * Called whenever we have a WSM containing the matching obj. 
- */ - StageState advance(WorkingSetID id, WorkingSetMember* member, WorkingSetID* out); +private: + /** + * Marks this stage as done, optionally adds key metadata, and returns PlanStage::ADVANCED. + * + * Called whenever we have a WSM containing the matching obj. + */ + StageState advance(WorkingSetID id, WorkingSetMember* member, WorkingSetID* out); - // transactional context for read locks. Not owned by us - OperationContext* _txn; + // transactional context for read locks. Not owned by us + OperationContext* _txn; - // Not owned here. - const Collection* _collection; + // Not owned here. + const Collection* _collection; - std::unique_ptr<RecordCursor> _recordCursor; + std::unique_ptr<RecordCursor> _recordCursor; - // The WorkingSet we annotate with results. Not owned by us. - WorkingSet* _workingSet; + // The WorkingSet we annotate with results. Not owned by us. + WorkingSet* _workingSet; - // The value to match against the _id field. - BSONObj _key; + // The value to match against the _id field. + BSONObj _key; - // Have we returned our one document? - bool _done; + // Have we returned our one document? + bool _done; - // Do we need to add index key metadata for $returnKey? - bool _addKeyMetadata; + // Do we need to add index key metadata for $returnKey? + bool _addKeyMetadata; - // If we want to return a RecordId and it points to something that's not in memory, - // we return a "please page this in" result. We add a RecordFetcher given back to us by the - // storage engine to the WSM. The RecordFetcher is used by the PlanExecutor when it handles - // the fetch request. - WorkingSetID _idBeingPagedIn; + // If we want to return a RecordId and it points to something that's not in memory, + // we return a "please page this in" result. We add a RecordFetcher given back to us by the + // storage engine to the WSM. The RecordFetcher is used by the PlanExecutor when it handles + // the fetch request. 
+ WorkingSetID _idBeingPagedIn; - CommonStats _commonStats; - IDHackStats _specificStats; - }; + CommonStats _commonStats; + IDHackStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/index_scan.cpp b/src/mongo/db/exec/index_scan.cpp index 0dda64c99d2..b7a963d3fd8 100644 --- a/src/mongo/db/exec/index_scan.cpp +++ b/src/mongo/db/exec/index_scan.cpp @@ -43,133 +43,130 @@ namespace { - // Return a value in the set {-1, 0, 1} to represent the sign of parameter i. - int sgn(int i) { - if (i == 0) - return 0; - return i > 0 ? 1 : -1; - } +// Return a value in the set {-1, 0, 1} to represent the sign of parameter i. +int sgn(int i) { + if (i == 0) + return 0; + return i > 0 ? 1 : -1; +} } // namespace namespace mongo { - // static - const char* IndexScan::kStageType = "IXSCAN"; - - IndexScan::IndexScan(OperationContext* txn, - const IndexScanParams& params, - WorkingSet* workingSet, - const MatchExpression* filter) - : _txn(txn), - _workingSet(workingSet), - _iam(params.descriptor->getIndexCatalog()->getIndex(params.descriptor)), - _keyPattern(params.descriptor->keyPattern().getOwned()), - _scanState(INITIALIZING), - _filter(filter), - _shouldDedup(true), - _forward(params.direction == 1), - _params(params), - _commonStats(kStageType), - _endKeyInclusive(false) { - - // We can't always access the descriptor in the call to getStats() so we pull - // any info we need for stats reporting out here. 
- _specificStats.keyPattern = _keyPattern; - _specificStats.indexName = _params.descriptor->indexName(); - _specificStats.isMultiKey = _params.descriptor->isMultikey(_txn); - _specificStats.isUnique = _params.descriptor->unique(); - _specificStats.isSparse = _params.descriptor->isSparse(); - _specificStats.isPartial = _params.descriptor->isPartial(); - _specificStats.indexVersion = _params.descriptor->version(); +// static +const char* IndexScan::kStageType = "IXSCAN"; + +IndexScan::IndexScan(OperationContext* txn, + const IndexScanParams& params, + WorkingSet* workingSet, + const MatchExpression* filter) + : _txn(txn), + _workingSet(workingSet), + _iam(params.descriptor->getIndexCatalog()->getIndex(params.descriptor)), + _keyPattern(params.descriptor->keyPattern().getOwned()), + _scanState(INITIALIZING), + _filter(filter), + _shouldDedup(true), + _forward(params.direction == 1), + _params(params), + _commonStats(kStageType), + _endKeyInclusive(false) { + // We can't always access the descriptor in the call to getStats() so we pull + // any info we need for stats reporting out here. + _specificStats.keyPattern = _keyPattern; + _specificStats.indexName = _params.descriptor->indexName(); + _specificStats.isMultiKey = _params.descriptor->isMultikey(_txn); + _specificStats.isUnique = _params.descriptor->unique(); + _specificStats.isSparse = _params.descriptor->isSparse(); + _specificStats.isPartial = _params.descriptor->isPartial(); + _specificStats.indexVersion = _params.descriptor->version(); +} + +boost::optional<IndexKeyEntry> IndexScan::initIndexScan() { + if (_params.doNotDedup) { + _shouldDedup = false; + } else { + // TODO it is incorrect to rely on this not changing. SERVER-17678 + _shouldDedup = _params.descriptor->isMultikey(_txn); } - boost::optional<IndexKeyEntry> IndexScan::initIndexScan() { - if (_params.doNotDedup) { - _shouldDedup = false; - } - else { - // TODO it is incorrect to rely on this not changing. 
SERVER-17678 - _shouldDedup = _params.descriptor->isMultikey(_txn); - } + // Perform the possibly heavy-duty initialization of the underlying index cursor. + _indexCursor = _iam->newCursor(_txn, _forward); + + if (_params.bounds.isSimpleRange) { + // Start at one key, end at another. + _endKey = _params.bounds.endKey; + _endKeyInclusive = _params.bounds.endKeyInclusive; + _indexCursor->setEndPosition(_endKey, _endKeyInclusive); + return _indexCursor->seek(_params.bounds.startKey, /*inclusive*/ true); + } else { + // For single intervals, we can use an optimized scan which checks against the position + // of an end cursor. For all other index scans, we fall back on using + // IndexBoundsChecker to determine when we've finished the scan. + BSONObj startKey; + bool startKeyInclusive; + if (IndexBoundsBuilder::isSingleInterval( + _params.bounds, &startKey, &startKeyInclusive, &_endKey, &_endKeyInclusive)) { + _indexCursor->setEndPosition(_endKey, _endKeyInclusive); + return _indexCursor->seek(startKey, startKeyInclusive); + } else { + _checker.reset(new IndexBoundsChecker(&_params.bounds, _keyPattern, _params.direction)); - // Perform the possibly heavy-duty initialization of the underlying index cursor. - _indexCursor = _iam->newCursor(_txn, _forward); + if (!_checker->getStartSeekPoint(&_seekPoint)) + return boost::none; - if (_params.bounds.isSimpleRange) { - // Start at one key, end at another. - _endKey = _params.bounds.endKey; - _endKeyInclusive = _params.bounds.endKeyInclusive; - _indexCursor->setEndPosition(_endKey, _endKeyInclusive); - return _indexCursor->seek(_params.bounds.startKey, /*inclusive*/true); - } - else { - // For single intervals, we can use an optimized scan which checks against the position - // of an end cursor. For all other index scans, we fall back on using - // IndexBoundsChecker to determine when we've finished the scan. 
- BSONObj startKey; - bool startKeyInclusive; - if (IndexBoundsBuilder::isSingleInterval(_params.bounds, - &startKey, - &startKeyInclusive, - &_endKey, - &_endKeyInclusive)) { - - _indexCursor->setEndPosition(_endKey, _endKeyInclusive); - return _indexCursor->seek(startKey, startKeyInclusive); - } - else { - _checker.reset(new IndexBoundsChecker(&_params.bounds, - _keyPattern, - _params.direction)); - - if (!_checker->getStartSeekPoint(&_seekPoint)) - return boost::none; - - return _indexCursor->seek(_seekPoint); - } + return _indexCursor->seek(_seekPoint); } } +} + +PlanStage::StageState IndexScan::work(WorkingSetID* out) { + ++_commonStats.works; + + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - PlanStage::StageState IndexScan::work(WorkingSetID* out) { - ++_commonStats.works; - - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); - - // Get the next kv pair from the index, if any. - boost::optional<IndexKeyEntry> kv; - try { - switch (_scanState) { - case INITIALIZING: kv = initIndexScan(); break; - case GETTING_NEXT: kv = _indexCursor->next(); break; - case NEED_SEEK: kv = _indexCursor->seek(_seekPoint); break; - case HIT_END: return PlanStage::IS_EOF; - } + // Get the next kv pair from the index, if any. + boost::optional<IndexKeyEntry> kv; + try { + switch (_scanState) { + case INITIALIZING: + kv = initIndexScan(); + break; + case GETTING_NEXT: + kv = _indexCursor->next(); + break; + case NEED_SEEK: + kv = _indexCursor->seek(_seekPoint); + break; + case HIT_END: + return PlanStage::IS_EOF; } - catch (const WriteConflictException& wce) { - *out = WorkingSet::INVALID_ID; - return PlanStage::NEED_YIELD; + } catch (const WriteConflictException& wce) { + *out = WorkingSet::INVALID_ID; + return PlanStage::NEED_YIELD; + } + + if (kv) { + // In debug mode, check that the cursor isn't lying to us. 
+ if (kDebugBuild && !_endKey.isEmpty()) { + int cmp = kv->key.woCompare(_endKey, + Ordering::make(_params.descriptor->keyPattern()), + /*compareFieldNames*/ false); + if (cmp == 0) + dassert(_endKeyInclusive); + dassert(_forward ? cmp <= 0 : cmp >= 0); } - if (kv) { - // In debug mode, check that the cursor isn't lying to us. - if (kDebugBuild && !_endKey.isEmpty()) { - int cmp = kv->key.woCompare(_endKey, - Ordering::make(_params.descriptor->keyPattern()), - /*compareFieldNames*/false); - if (cmp == 0) dassert(_endKeyInclusive); - dassert(_forward ? cmp <= 0 : cmp >= 0); - } - - ++_specificStats.keysExamined; - if (_params.maxScan && _specificStats.keysExamined >= _params.maxScan) { - kv = boost::none; - } + ++_specificStats.keysExamined; + if (_params.maxScan && _specificStats.keysExamined >= _params.maxScan) { + kv = boost::none; } + } - if (kv && _checker) { - switch (_checker->checkKey(kv->key, &_seekPoint)) { + if (kv && _checker) { + switch (_checker->checkKey(kv->key, &_seekPoint)) { case IndexBoundsChecker::VALID: break; @@ -181,138 +178,141 @@ namespace mongo { _scanState = NEED_SEEK; _commonStats.needTime++; return PlanStage::NEED_TIME; - } } + } - if (!kv) { - _scanState = HIT_END; - _commonStats.isEOF = true; - _indexCursor.reset(); - return PlanStage::IS_EOF; - } + if (!kv) { + _scanState = HIT_END; + _commonStats.isEOF = true; + _indexCursor.reset(); + return PlanStage::IS_EOF; + } - _scanState = GETTING_NEXT; + _scanState = GETTING_NEXT; - if (_shouldDedup) { - ++_specificStats.dupsTested; - if (!_returned.insert(kv->loc).second) { - // We've seen this RecordId before. Skip it this time. - ++_specificStats.dupsDropped; - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } + if (_shouldDedup) { + ++_specificStats.dupsTested; + if (!_returned.insert(kv->loc).second) { + // We've seen this RecordId before. Skip it this time. 
+ ++_specificStats.dupsDropped; + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } + } - if (_filter) { - if (!Filter::passes(kv->key, _keyPattern, _filter)) { - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } + if (_filter) { + if (!Filter::passes(kv->key, _keyPattern, _filter)) { + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } + } - if (!kv->key.isOwned()) kv->key = kv->key.getOwned(); - - // We found something to return, so fill out the WSM. - WorkingSetID id = _workingSet->allocate(); - WorkingSetMember* member = _workingSet->get(id); - member->loc = kv->loc; - member->keyData.push_back(IndexKeyDatum(_keyPattern, kv->key, _iam)); - member->state = WorkingSetMember::LOC_AND_IDX; + if (!kv->key.isOwned()) + kv->key = kv->key.getOwned(); - if (_params.addKeyMetadata) { - BSONObjBuilder bob; - bob.appendKeys(_keyPattern, kv->key); - member->addComputed(new IndexKeyComputedData(bob.obj())); - } + // We found something to return, so fill out the WSM. + WorkingSetID id = _workingSet->allocate(); + WorkingSetMember* member = _workingSet->get(id); + member->loc = kv->loc; + member->keyData.push_back(IndexKeyDatum(_keyPattern, kv->key, _iam)); + member->state = WorkingSetMember::LOC_AND_IDX; - *out = id; - ++_commonStats.advanced; - return PlanStage::ADVANCED; + if (_params.addKeyMetadata) { + BSONObjBuilder bob; + bob.appendKeys(_keyPattern, kv->key); + member->addComputed(new IndexKeyComputedData(bob.obj())); } - bool IndexScan::isEOF() { - return _commonStats.isEOF; - } + *out = id; + ++_commonStats.advanced; + return PlanStage::ADVANCED; +} - void IndexScan::saveState() { - if (!_txn) { - // We were already saved. Nothing to do. - return; - } +bool IndexScan::isEOF() { + return _commonStats.isEOF; +} - _txn = NULL; - ++_commonStats.yields; - if (!_indexCursor) return; +void IndexScan::saveState() { + if (!_txn) { + // We were already saved. Nothing to do. 
+ return; + } - if (_scanState == NEED_SEEK) { - _indexCursor->saveUnpositioned(); - return; - } + _txn = NULL; + ++_commonStats.yields; + if (!_indexCursor) + return; - _indexCursor->savePositioned(); + if (_scanState == NEED_SEEK) { + _indexCursor->saveUnpositioned(); + return; } - void IndexScan::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; + _indexCursor->savePositioned(); +} - if (_indexCursor) _indexCursor->restore(opCtx); - } +void IndexScan::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; - void IndexScan::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; + if (_indexCursor) + _indexCursor->restore(opCtx); +} - // The only state we're responsible for holding is what RecordIds to drop. If a document - // mutates the underlying index cursor will deal with it. - if (INVALIDATION_MUTATION == type) { - return; - } +void IndexScan::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; - // If we see this RecordId again, it may not be the same document it was before, so we want - // to return it if we see it again. - unordered_set<RecordId, RecordId::Hasher>::iterator it = _returned.find(dl); - if (it != _returned.end()) { - ++_specificStats.seenInvalidated; - _returned.erase(it); - } + // The only state we're responsible for holding is what RecordIds to drop. If a document + // mutates the underlying index cursor will deal with it. + if (INVALIDATION_MUTATION == type) { + return; } - std::vector<PlanStage*> IndexScan::getChildren() const { - return {}; + // If we see this RecordId again, it may not be the same document it was before, so we want + // to return it if we see it again. 
+ unordered_set<RecordId, RecordId::Hasher>::iterator it = _returned.find(dl); + if (it != _returned.end()) { + ++_specificStats.seenInvalidated; + _returned.erase(it); } +} - PlanStageStats* IndexScan::getStats() { - // WARNING: this could be called even if the collection was dropped. Do not access any - // catalog information here. +std::vector<PlanStage*> IndexScan::getChildren() const { + return {}; +} - // Add a BSON representation of the filter to the stats tree, if there is one. - if (NULL != _filter) { - BSONObjBuilder bob; - _filter->toBSON(&bob); - _commonStats.filter = bob.obj(); - } +PlanStageStats* IndexScan::getStats() { + // WARNING: this could be called even if the collection was dropped. Do not access any + // catalog information here. - // These specific stats fields never change. - if (_specificStats.indexType.empty()) { - _specificStats.indexType = "BtreeCursor"; // TODO amName; + // Add a BSON representation of the filter to the stats tree, if there is one. + if (NULL != _filter) { + BSONObjBuilder bob; + _filter->toBSON(&bob); + _commonStats.filter = bob.obj(); + } - _specificStats.indexBounds = _params.bounds.toBSON(); + // These specific stats fields never change. 
+ if (_specificStats.indexType.empty()) { + _specificStats.indexType = "BtreeCursor"; // TODO amName; - _specificStats.direction = _params.direction; - } + _specificStats.indexBounds = _params.bounds.toBSON(); - std::unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_IXSCAN)); - ret->specific.reset(new IndexScanStats(_specificStats)); - return ret.release(); + _specificStats.direction = _params.direction; } - const CommonStats* IndexScan::getCommonStats() const { - return &_commonStats; - } + std::unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_IXSCAN)); + ret->specific.reset(new IndexScanStats(_specificStats)); + return ret.release(); +} - const SpecificStats* IndexScan::getSpecificStats() const { - return &_specificStats; - } +const CommonStats* IndexScan::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* IndexScan::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/index_scan.h b/src/mongo/db/exec/index_scan.h index 555b00c4beb..d415c3b985b 100644 --- a/src/mongo/db/exec/index_scan.h +++ b/src/mongo/db/exec/index_scan.h @@ -41,145 +41,143 @@ namespace mongo { - class IndexAccessMethod; - class IndexDescriptor; - class WorkingSet; +class IndexAccessMethod; +class IndexDescriptor; +class WorkingSet; - struct IndexScanParams { - IndexScanParams() : descriptor(NULL), - direction(1), - doNotDedup(false), - maxScan(0), - addKeyMetadata(false) { } +struct IndexScanParams { + IndexScanParams() + : descriptor(NULL), direction(1), doNotDedup(false), maxScan(0), addKeyMetadata(false) {} - const IndexDescriptor* descriptor; + const IndexDescriptor* descriptor; - IndexBounds bounds; + IndexBounds bounds; - int direction; + int direction; - bool doNotDedup; + bool doNotDedup; - // How many keys will we look at? - size_t maxScan; + // How many keys will we look at? + size_t maxScan; - // Do we want to add the key as metadata? 
- bool addKeyMetadata; - }; + // Do we want to add the key as metadata? + bool addKeyMetadata; +}; +/** + * Stage scans over an index from startKey to endKey, returning results that pass the provided + * filter. Internally dedups on RecordId. + * + * Sub-stage preconditions: None. Is a leaf and consumes no stage data. + */ +class IndexScan : public PlanStage { +public: /** - * Stage scans over an index from startKey to endKey, returning results that pass the provided - * filter. Internally dedups on RecordId. - * - * Sub-stage preconditions: None. Is a leaf and consumes no stage data. + * Keeps track of what this index scan is currently doing so that it + * can do the right thing on the next call to work(). */ - class IndexScan : public PlanStage { - public: - - /** - * Keeps track of what this index scan is currently doing so that it - * can do the right thing on the next call to work(). - */ - enum ScanState { - // Need to initialize the underlying index traversal machinery. - INITIALIZING, + enum ScanState { + // Need to initialize the underlying index traversal machinery. + INITIALIZING, - // Skipping keys as directed by the _checker. - NEED_SEEK, + // Skipping keys as directed by the _checker. + NEED_SEEK, - // Retrieving the next key, and applying the filter if necessary. - GETTING_NEXT, + // Retrieving the next key, and applying the filter if necessary. + GETTING_NEXT, - // The index scan is finished. - HIT_END - }; + // The index scan is finished. 
+ HIT_END + }; - IndexScan(OperationContext* txn, - const IndexScanParams& params, - WorkingSet* workingSet, - const MatchExpression* filter); + IndexScan(OperationContext* txn, + const IndexScanParams& params, + WorkingSet* workingSet, + const MatchExpression* filter); - virtual ~IndexScan() { } + virtual ~IndexScan() {} - virtual StageState work(WorkingSetID* out); - virtual bool isEOF(); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_IXSCAN; } + virtual StageType stageType() const { + return STAGE_IXSCAN; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - /** - * Initialize the underlying index Cursor, returning first result if any. - */ - boost::optional<IndexKeyEntry> initIndexScan(); +private: + /** + * Initialize the underlying index Cursor, returning first result if any. + */ + boost::optional<IndexKeyEntry> initIndexScan(); - // transactional context for read locks. Not owned by us - OperationContext* _txn; + // transactional context for read locks. Not owned by us + OperationContext* _txn; - // The WorkingSet we fill with results. Not owned by us. 
- WorkingSet* const _workingSet; + // The WorkingSet we fill with results. Not owned by us. + WorkingSet* const _workingSet; - // Index access. - const IndexAccessMethod* const _iam; // owned by Collection -> IndexCatalog - std::unique_ptr<SortedDataInterface::Cursor> _indexCursor; - const BSONObj _keyPattern; + // Index access. + const IndexAccessMethod* const _iam; // owned by Collection -> IndexCatalog + std::unique_ptr<SortedDataInterface::Cursor> _indexCursor; + const BSONObj _keyPattern; - // Keeps track of what work we need to do next. - ScanState _scanState; + // Keeps track of what work we need to do next. + ScanState _scanState; - // Contains expressions only over fields in the index key. We assume this is built - // correctly by whomever creates this class. - // The filter is not owned by us. - const MatchExpression* const _filter; + // Contains expressions only over fields in the index key. We assume this is built + // correctly by whomever creates this class. + // The filter is not owned by us. + const MatchExpression* const _filter; - // Could our index have duplicates? If so, we use _returned to dedup. - bool _shouldDedup; - unordered_set<RecordId, RecordId::Hasher> _returned; + // Could our index have duplicates? If so, we use _returned to dedup. 
+ bool _shouldDedup; + unordered_set<RecordId, RecordId::Hasher> _returned; - const bool _forward; - const IndexScanParams _params; + const bool _forward; + const IndexScanParams _params; - // Stats - CommonStats _commonStats; - IndexScanStats _specificStats; + // Stats + CommonStats _commonStats; + IndexScanStats _specificStats; - // - // This class employs one of two different algorithms for determining when the index scan - // has reached the end: - // + // + // This class employs one of two different algorithms for determining when the index scan + // has reached the end: + // - // - // 1) If the index scan is not a single contiguous interval, then we use an - // IndexBoundsChecker to determine which keys to return and when to stop scanning. - // In this case, _checker will be non-NULL. - // + // + // 1) If the index scan is not a single contiguous interval, then we use an + // IndexBoundsChecker to determine which keys to return and when to stop scanning. + // In this case, _checker will be non-NULL. + // - std::unique_ptr<IndexBoundsChecker> _checker; - IndexSeekPoint _seekPoint; + std::unique_ptr<IndexBoundsChecker> _checker; + IndexSeekPoint _seekPoint; - // - // 2) If the index scan is a single contiguous interval, then the scan can execute faster by - // letting the index cursor tell us when it hits the end, rather than repeatedly doing - // BSON compares against scanned keys. In this case _checker will be NULL. - // + // + // 2) If the index scan is a single contiguous interval, then the scan can execute faster by + // letting the index cursor tell us when it hits the end, rather than repeatedly doing + // BSON compares against scanned keys. In this case _checker will be NULL. + // - // The key that the index cursor should stop on/after. - BSONObj _endKey; + // The key that the index cursor should stop on/after. + BSONObj _endKey; - // Is the end key included in the range? - bool _endKeyInclusive; - }; + // Is the end key included in the range? 
+ bool _endKeyInclusive; +}; } // namespace mongo diff --git a/src/mongo/db/exec/keep_mutations.cpp b/src/mongo/db/exec/keep_mutations.cpp index 7c88a59fde5..c30d276782d 100644 --- a/src/mongo/db/exec/keep_mutations.cpp +++ b/src/mongo/db/exec/keep_mutations.cpp @@ -33,127 +33,127 @@ namespace mongo { - using std::unique_ptr; - using std::vector; - - // static - const char* KeepMutationsStage::kStageType = "KEEP_MUTATIONS"; - - KeepMutationsStage::KeepMutationsStage(const MatchExpression* filter, - WorkingSet* ws, - PlanStage* child) - : _workingSet(ws), - _child(child), - _filter(filter), - _doneReadingChild(false), - _doneReturningFlagged(false), - _commonStats(kStageType) { } - - KeepMutationsStage::~KeepMutationsStage() { } - - bool KeepMutationsStage::isEOF() { - return _doneReadingChild && _doneReturningFlagged; - } - - PlanStage::StageState KeepMutationsStage::work(WorkingSetID* out) { - ++_commonStats.works; +using std::unique_ptr; +using std::vector; - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); +// static +const char* KeepMutationsStage::kStageType = "KEEP_MUTATIONS"; - // If we've returned as many results as we're limited to, isEOF will be true. - if (isEOF()) { return PlanStage::IS_EOF; } +KeepMutationsStage::KeepMutationsStage(const MatchExpression* filter, + WorkingSet* ws, + PlanStage* child) + : _workingSet(ws), + _child(child), + _filter(filter), + _doneReadingChild(false), + _doneReturningFlagged(false), + _commonStats(kStageType) {} - // Stream child results until the child is all done. - if (!_doneReadingChild) { - StageState status = _child->work(out); +KeepMutationsStage::~KeepMutationsStage() {} - // Child is still returning results. Pass them through. 
- if (PlanStage::IS_EOF != status) { - if (PlanStage::ADVANCED == status) { - ++_commonStats.advanced; - } - else if (PlanStage::NEED_TIME == status) { - ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == status) { - ++_commonStats.needYield; - } +bool KeepMutationsStage::isEOF() { + return _doneReadingChild && _doneReturningFlagged; +} - return status; - } +PlanStage::StageState KeepMutationsStage::work(WorkingSetID* out) { + ++_commonStats.works; - // Child is EOF. We want to stream flagged results if there are any. - _doneReadingChild = true; - - // Read out all of the flagged results from the working set. We can't iterate through - // the working set's flagged result set directly, since it may be modified later if - // further documents are invalidated during a yield. - std::copy(_workingSet->getFlagged().begin(), _workingSet->getFlagged().end(), - std::back_inserter(_flagged)); - _flaggedIterator = _flagged.begin(); - } + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - // We're streaming flagged results. - invariant(!_doneReturningFlagged); - if (_flaggedIterator == _flagged.end()) { - _doneReturningFlagged = true; - return PlanStage::IS_EOF; - } - - WorkingSetID idToTest = *_flaggedIterator; - _flaggedIterator++; - - WorkingSetMember* member = _workingSet->get(idToTest); - if (Filter::passes(member, _filter)) { - *out = idToTest; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - else { - _workingSet->free(idToTest); - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - } - - void KeepMutationsStage::saveState() { - ++_commonStats.yields; - _child->saveState(); + // If we've returned as many results as we're limited to, isEOF will be true. 
+ if (isEOF()) { + return PlanStage::IS_EOF; } - void KeepMutationsStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; - _child->restoreState(opCtx); - } + // Stream child results until the child is all done. + if (!_doneReadingChild) { + StageState status = _child->work(out); + + // Child is still returning results. Pass them through. + if (PlanStage::IS_EOF != status) { + if (PlanStage::ADVANCED == status) { + ++_commonStats.advanced; + } else if (PlanStage::NEED_TIME == status) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == status) { + ++_commonStats.needYield; + } - void KeepMutationsStage::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - ++_commonStats.invalidates; - _child->invalidate(txn, dl, type); - } + return status; + } - vector<PlanStage*> KeepMutationsStage::getChildren() const { - vector<PlanStage*> children; - children.push_back(_child.get()); - return children; - } + // Child is EOF. We want to stream flagged results if there are any. + _doneReadingChild = true; - PlanStageStats* KeepMutationsStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_KEEP_MUTATIONS)); - // Takes ownership of the object returned from _child->getStats(). - ret->children.push_back(_child->getStats()); - return ret.release(); + // Read out all of the flagged results from the working set. We can't iterate through + // the working set's flagged result set directly, since it may be modified later if + // further documents are invalidated during a yield. + std::copy(_workingSet->getFlagged().begin(), + _workingSet->getFlagged().end(), + std::back_inserter(_flagged)); + _flaggedIterator = _flagged.begin(); } - const CommonStats* KeepMutationsStage::getCommonStats() const { - return &_commonStats; + // We're streaming flagged results. 
+ invariant(!_doneReturningFlagged); + if (_flaggedIterator == _flagged.end()) { + _doneReturningFlagged = true; + return PlanStage::IS_EOF; } - const SpecificStats* KeepMutationsStage::getSpecificStats() const { - return NULL; + WorkingSetID idToTest = *_flaggedIterator; + _flaggedIterator++; + + WorkingSetMember* member = _workingSet->get(idToTest); + if (Filter::passes(member, _filter)) { + *out = idToTest; + ++_commonStats.advanced; + return PlanStage::ADVANCED; + } else { + _workingSet->free(idToTest); + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } +} + +void KeepMutationsStage::saveState() { + ++_commonStats.yields; + _child->saveState(); +} + +void KeepMutationsStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; + _child->restoreState(opCtx); +} + +void KeepMutationsStage::invalidate(OperationContext* txn, + const RecordId& dl, + InvalidationType type) { + ++_commonStats.invalidates; + _child->invalidate(txn, dl, type); +} + +vector<PlanStage*> KeepMutationsStage::getChildren() const { + vector<PlanStage*> children; + children.push_back(_child.get()); + return children; +} + +PlanStageStats* KeepMutationsStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_KEEP_MUTATIONS)); + // Takes ownership of the object returned from _child->getStats(). + ret->children.push_back(_child->getStats()); + return ret.release(); +} + +const CommonStats* KeepMutationsStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* KeepMutationsStage::getSpecificStats() const { + return NULL; +} } // namespace mongo diff --git a/src/mongo/db/exec/keep_mutations.h b/src/mongo/db/exec/keep_mutations.h index 111d777d894..cbf9f75b928 100644 --- a/src/mongo/db/exec/keep_mutations.h +++ b/src/mongo/db/exec/keep_mutations.h @@ -36,62 +36,64 @@ namespace mongo { - /** - * KeepMutationsStage passes all of its child's data through until the child is EOF. 
- * It then returns all flagged elements in the WorkingSet that pass the stage's filter. - * - * This stage is used to merge results that are invalidated mid-query back into the query - * results when possible. The query planner is responsible for determining when it's valid to - * merge these results. - */ - class KeepMutationsStage : public PlanStage { - public: - KeepMutationsStage(const MatchExpression* filter, WorkingSet* ws, PlanStage* child); - virtual ~KeepMutationsStage(); +/** + * KeepMutationsStage passes all of its child's data through until the child is EOF. + * It then returns all flagged elements in the WorkingSet that pass the stage's filter. + * + * This stage is used to merge results that are invalidated mid-query back into the query + * results when possible. The query planner is responsible for determining when it's valid to + * merge these results. + */ +class KeepMutationsStage : public PlanStage { +public: + KeepMutationsStage(const MatchExpression* filter, WorkingSet* ws, PlanStage* child); + virtual ~KeepMutationsStage(); - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_KEEP_MUTATIONS; } + virtual StageType stageType() const { + return STAGE_KEEP_MUTATIONS; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - 
virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - // Not owned here. - WorkingSet* _workingSet; +private: + // Not owned here. + WorkingSet* _workingSet; - std::unique_ptr<PlanStage> _child; + std::unique_ptr<PlanStage> _child; - // Not owned here. Should be the full query expression tree. - const MatchExpression* _filter; + // Not owned here. Should be the full query expression tree. + const MatchExpression* _filter; - // We read from our child... - bool _doneReadingChild; + // We read from our child... + bool _doneReadingChild; - // ...until it's out of results, at which point we put any flagged results back in the query - // stream. - bool _doneReturningFlagged; + // ...until it's out of results, at which point we put any flagged results back in the query + // stream. + bool _doneReturningFlagged; - // Stats. - CommonStats _commonStats; + // Stats. + CommonStats _commonStats; - // Our copy of the working set's flagged results. - std::vector<WorkingSetID> _flagged; + // Our copy of the working set's flagged results. + std::vector<WorkingSetID> _flagged; - // Iterator pointing into _flagged. - std::vector<WorkingSetID>::const_iterator _flaggedIterator; - }; + // Iterator pointing into _flagged. 
+ std::vector<WorkingSetID>::const_iterator _flaggedIterator; +}; } // namespace mongo diff --git a/src/mongo/db/exec/limit.cpp b/src/mongo/db/exec/limit.cpp index c766d07c650..a62f6e863e3 100644 --- a/src/mongo/db/exec/limit.cpp +++ b/src/mongo/db/exec/limit.cpp @@ -34,103 +34,99 @@ namespace mongo { - using std::unique_ptr; - using std::vector; - - // static - const char* LimitStage::kStageType = "LIMIT"; - - LimitStage::LimitStage(int limit, WorkingSet* ws, PlanStage* child) - : _ws(ws), - _child(child), - _numToReturn(limit), - _commonStats(kStageType) { - _specificStats.limit = _numToReturn; - } +using std::unique_ptr; +using std::vector; - LimitStage::~LimitStage() { } +// static +const char* LimitStage::kStageType = "LIMIT"; - bool LimitStage::isEOF() { return (0 == _numToReturn) || _child->isEOF(); } +LimitStage::LimitStage(int limit, WorkingSet* ws, PlanStage* child) + : _ws(ws), _child(child), _numToReturn(limit), _commonStats(kStageType) { + _specificStats.limit = _numToReturn; +} - PlanStage::StageState LimitStage::work(WorkingSetID* out) { - ++_commonStats.works; +LimitStage::~LimitStage() {} - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); +bool LimitStage::isEOF() { + return (0 == _numToReturn) || _child->isEOF(); +} - if (0 == _numToReturn) { - // We've returned as many results as we're limited to. - return PlanStage::IS_EOF; - } +PlanStage::StageState LimitStage::work(WorkingSetID* out) { + ++_commonStats.works; - WorkingSetID id = WorkingSet::INVALID_ID; - StageState status = _child->work(&id); + // Adds the amount of time taken by work() to executionTimeMillis. 
+ ScopedTimer timer(&_commonStats.executionTimeMillis); - if (PlanStage::ADVANCED == status) { - *out = id; - --_numToReturn; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { - *out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. - if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "limit stage failed to read in results from child"; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } - return status; - } - else if (PlanStage::NEED_TIME == status) { - ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == status) { - ++_commonStats.needYield; - *out = id; - } - - return status; + if (0 == _numToReturn) { + // We've returned as many results as we're limited to. + return PlanStage::IS_EOF; } - void LimitStage::saveState() { - ++_commonStats.yields; - _child->saveState(); - } - - void LimitStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; - _child->restoreState(opCtx); - } - - void LimitStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - _child->invalidate(txn, dl, type); - } - - vector<PlanStage*> LimitStage::getChildren() const { - vector<PlanStage*> children; - children.push_back(_child.get()); - return children; - } - - PlanStageStats* LimitStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_LIMIT)); - ret->specific.reset(new LimitStats(_specificStats)); - ret->children.push_back(_child->getStats()); - return ret.release(); - } - - const CommonStats* LimitStage::getCommonStats() const { - return &_commonStats; + WorkingSetID id = WorkingSet::INVALID_ID; + StageState status = _child->work(&id); + + if 
(PlanStage::ADVANCED == status) { + *out = id; + --_numToReturn; + ++_commonStats.advanced; + return PlanStage::ADVANCED; + } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. + if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "limit stage failed to read in results from child"; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + } + return status; + } else if (PlanStage::NEED_TIME == status) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == status) { + ++_commonStats.needYield; + *out = id; } - const SpecificStats* LimitStage::getSpecificStats() const { - return &_specificStats; - } + return status; +} + +void LimitStage::saveState() { + ++_commonStats.yields; + _child->saveState(); +} + +void LimitStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; + _child->restoreState(opCtx); +} + +void LimitStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + _child->invalidate(txn, dl, type); +} + +vector<PlanStage*> LimitStage::getChildren() const { + vector<PlanStage*> children; + children.push_back(_child.get()); + return children; +} + +PlanStageStats* LimitStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_LIMIT)); + ret->specific.reset(new LimitStats(_specificStats)); + ret->children.push_back(_child->getStats()); + return ret.release(); +} + +const CommonStats* LimitStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* LimitStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/limit.h b/src/mongo/db/exec/limit.h index 
f3f722fd1a1..828b6e6c0d4 100644 --- a/src/mongo/db/exec/limit.h +++ b/src/mongo/db/exec/limit.h @@ -35,47 +35,49 @@ namespace mongo { - /** - * This stage implements limit functionality. It only returns 'limit' results before EOF. - * - * Sort has a baked-in limit, as it can optimize the sort if it has a limit. - * - * Preconditions: None. - */ - class LimitStage : public PlanStage { - public: - LimitStage(int limit, WorkingSet* ws, PlanStage* child); - virtual ~LimitStage(); +/** + * This stage implements limit functionality. It only returns 'limit' results before EOF. + * + * Sort has a baked-in limit, as it can optimize the sort if it has a limit. + * + * Preconditions: None. + */ +class LimitStage : public PlanStage { +public: + LimitStage(int limit, WorkingSet* ws, PlanStage* child); + virtual ~LimitStage(); - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_LIMIT; } + virtual StageType stageType() const { + return STAGE_LIMIT; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - WorkingSet* _ws; - std::unique_ptr<PlanStage> _child; +private: + 
WorkingSet* _ws; + std::unique_ptr<PlanStage> _child; - // We only return this many results. - int _numToReturn; + // We only return this many results. + int _numToReturn; - // Stats - CommonStats _commonStats; - LimitStats _specificStats; - }; + // Stats + CommonStats _commonStats; + LimitStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/merge_sort.cpp b/src/mongo/db/exec/merge_sort.cpp index 37c8269a502..7f0581da18c 100644 --- a/src/mongo/db/exec/merge_sort.cpp +++ b/src/mongo/db/exec/merge_sort.cpp @@ -35,255 +35,257 @@ namespace mongo { - using std::unique_ptr; - using std::list; - using std::string; - using std::vector; - - // static - const char* MergeSortStage::kStageType = "SORT_MERGE"; - - MergeSortStage::MergeSortStage(const MergeSortStageParams& params, - WorkingSet* ws, - const Collection* collection) - : _collection(collection), - _ws(ws), - _pattern(params.pattern), - _dedup(params.dedup), - _merging(StageWithValueComparison(ws, params.pattern)), - _commonStats(kStageType) { } - - MergeSortStage::~MergeSortStage() { - for (size_t i = 0; i < _children.size(); ++i) { delete _children[i]; } +using std::unique_ptr; +using std::list; +using std::string; +using std::vector; + +// static +const char* MergeSortStage::kStageType = "SORT_MERGE"; + +MergeSortStage::MergeSortStage(const MergeSortStageParams& params, + WorkingSet* ws, + const Collection* collection) + : _collection(collection), + _ws(ws), + _pattern(params.pattern), + _dedup(params.dedup), + _merging(StageWithValueComparison(ws, params.pattern)), + _commonStats(kStageType) {} + +MergeSortStage::~MergeSortStage() { + for (size_t i = 0; i < _children.size(); ++i) { + delete _children[i]; } +} - void MergeSortStage::addChild(PlanStage* child) { - _children.push_back(child); +void MergeSortStage::addChild(PlanStage* child) { + _children.push_back(child); - // We have to call work(...) on every child before we can pick a min. 
- _noResultToMerge.push(child); - } - - bool MergeSortStage::isEOF() { - // If we have no more results to return, and we have no more children that we can call - // work(...) on to get results, we're done. - return _merging.empty() && _noResultToMerge.empty(); - } + // We have to call work(...) on every child before we can pick a min. + _noResultToMerge.push(child); +} - PlanStage::StageState MergeSortStage::work(WorkingSetID* out) { - ++_commonStats.works; +bool MergeSortStage::isEOF() { + // If we have no more results to return, and we have no more children that we can call + // work(...) on to get results, we're done. + return _merging.empty() && _noResultToMerge.empty(); +} - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); +PlanStage::StageState MergeSortStage::work(WorkingSetID* out) { + ++_commonStats.works; - if (isEOF()) { return PlanStage::IS_EOF; } + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - if (!_noResultToMerge.empty()) { - // We have some child that we don't have a result from. Each child must have a result - // in order to pick the minimum result among all our children. Work a child. - PlanStage* child = _noResultToMerge.front(); - WorkingSetID id = WorkingSet::INVALID_ID; - StageState code = child->work(&id); - - if (PlanStage::ADVANCED == code) { - // If we're deduping... - if (_dedup) { - WorkingSetMember* member = _ws->get(id); + if (isEOF()) { + return PlanStage::IS_EOF; + } - if (!member->hasLoc()) { - // Can't dedup data unless there's a RecordId. We go ahead and use its - // result. + if (!_noResultToMerge.empty()) { + // We have some child that we don't have a result from. Each child must have a result + // in order to pick the minimum result among all our children. Work a child. 
+ PlanStage* child = _noResultToMerge.front(); + WorkingSetID id = WorkingSet::INVALID_ID; + StageState code = child->work(&id); + + if (PlanStage::ADVANCED == code) { + // If we're deduping... + if (_dedup) { + WorkingSetMember* member = _ws->get(id); + + if (!member->hasLoc()) { + // Can't dedup data unless there's a RecordId. We go ahead and use its + // result. + _noResultToMerge.pop(); + } else { + ++_specificStats.dupsTested; + // ...and there's a diskloc and and we've seen the RecordId before + if (_seen.end() != _seen.find(member->loc)) { + // ...drop it. + _ws->free(id); + ++_commonStats.needTime; + ++_specificStats.dupsDropped; + return PlanStage::NEED_TIME; + } else { + // Otherwise, note that we've seen it. + _seen.insert(member->loc); + // We're going to use the result from the child, so we remove it from + // the queue of children without a result. _noResultToMerge.pop(); } - else { - ++_specificStats.dupsTested; - // ...and there's a diskloc and and we've seen the RecordId before - if (_seen.end() != _seen.find(member->loc)) { - // ...drop it. - _ws->free(id); - ++_commonStats.needTime; - ++_specificStats.dupsDropped; - return PlanStage::NEED_TIME; - } - else { - // Otherwise, note that we've seen it. - _seen.insert(member->loc); - // We're going to use the result from the child, so we remove it from - // the queue of children without a result. - _noResultToMerge.pop(); - } - } - } - else { - // Not deduping. We use any result we get from the child. Remove the child - // from the queue of things without a result. - _noResultToMerge.pop(); } + } else { + // Not deduping. We use any result we get from the child. Remove the child + // from the queue of things without a result. + _noResultToMerge.pop(); + } - // Store the result in our list. - StageWithValue value; - value.id = id; - value.stage = child; - _mergingData.push_front(value); + // Store the result in our list. 
+ StageWithValue value; + value.id = id; + value.stage = child; + _mergingData.push_front(value); - // Insert the result (indirectly) into our priority queue. - _merging.push(_mergingData.begin()); + // Insert the result (indirectly) into our priority queue. + _merging.push(_mergingData.begin()); - ++_commonStats.needTime; - return PlanStage::NEED_TIME; + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::IS_EOF == code) { + // There are no more results possible from this child. Don't bother with it + // anymore. + _noResultToMerge.pop(); + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::FAILURE == code || PlanStage::DEAD == code) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. + if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "merge sort stage failed to read in results from child"; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); } - else if (PlanStage::IS_EOF == code) { - // There are no more results possible from this child. Don't bother with it - // anymore. - _noResultToMerge.pop(); + return code; + } else { + if (PlanStage::NEED_TIME == code) { ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - else if (PlanStage::FAILURE == code || PlanStage::DEAD == code) { + } else if (PlanStage::NEED_YIELD == code) { *out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. 
- if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "merge sort stage failed to read in results from child"; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } - return code; + ++_commonStats.needYield; } - else { - if (PlanStage::NEED_TIME == code) { - ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == code) { - *out = id; - ++_commonStats.needYield; - } - return code; - } + return code; } + } - // If we're here, for each non-EOF child, we have a valid WSID. - verify(!_merging.empty()); - - // Get the 'min' WSID. _merging is a priority queue so its top is the smallest. - MergingRef top = _merging.top(); - _merging.pop(); + // If we're here, for each non-EOF child, we have a valid WSID. + verify(!_merging.empty()); - // Since we're returning the WSID that came from top->stage, we need to work(...) it again - // to get a new result. - _noResultToMerge.push(top->stage); + // Get the 'min' WSID. _merging is a priority queue so its top is the smallest. + MergingRef top = _merging.top(); + _merging.pop(); - // Save the ID that we're returning and remove the returned result from our data. - WorkingSetID idToTest = top->id; - _mergingData.erase(top); + // Since we're returning the WSID that came from top->stage, we need to work(...) it again + // to get a new result. + _noResultToMerge.push(top->stage); - // Return the min. - *out = idToTest; - ++_commonStats.advanced; + // Save the ID that we're returning and remove the returned result from our data. + WorkingSetID idToTest = top->id; + _mergingData.erase(top); - // But don't return it if it's flagged. - if (_ws->isFlagged(*out)) { - return PlanStage::NEED_TIME; - } + // Return the min. + *out = idToTest; + ++_commonStats.advanced; - return PlanStage::ADVANCED; + // But don't return it if it's flagged. 
+ if (_ws->isFlagged(*out)) { + return PlanStage::NEED_TIME; } - void MergeSortStage::saveState() { - ++_commonStats.yields; - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->saveState(); - } + return PlanStage::ADVANCED; +} + +void MergeSortStage::saveState() { + ++_commonStats.yields; + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->saveState(); } +} - void MergeSortStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->restoreState(opCtx); - } +void MergeSortStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->restoreState(opCtx); } +} - void MergeSortStage::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - ++_commonStats.invalidates; - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->invalidate(txn, dl, type); - } +void MergeSortStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->invalidate(txn, dl, type); + } - // Go through our data and see if we're holding on to the invalidated loc. - for (list<StageWithValue>::iterator valueIt = _mergingData.begin(); valueIt != _mergingData.end(); valueIt++) { - WorkingSetMember* member = _ws->get(valueIt->id); - if (member->hasLoc() && (dl == member->loc)) { - // Force a fetch and flag. We could possibly merge this result back in later. - WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); - _ws->flagForReview(valueIt->id); - ++_specificStats.forcedFetches; - } + // Go through our data and see if we're holding on to the invalidated loc. 
+ for (list<StageWithValue>::iterator valueIt = _mergingData.begin(); + valueIt != _mergingData.end(); + valueIt++) { + WorkingSetMember* member = _ws->get(valueIt->id); + if (member->hasLoc() && (dl == member->loc)) { + // Force a fetch and flag. We could possibly merge this result back in later. + WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); + _ws->flagForReview(valueIt->id); + ++_specificStats.forcedFetches; } - - // If we see DL again it is not the same record as it once was so we still want to - // return it. - if (_dedup) { _seen.erase(dl); } } - // Is lhs less than rhs? Note that priority_queue is a max heap by default so we invert - // the return from the expected value. - bool MergeSortStage::StageWithValueComparison::operator()( - const MergingRef& lhs, const MergingRef& rhs) { - - WorkingSetMember* lhsMember = _ws->get(lhs->id); - WorkingSetMember* rhsMember = _ws->get(rhs->id); - - BSONObjIterator it(_pattern); - while (it.more()) { - BSONElement patternElt = it.next(); - string fn = patternElt.fieldName(); - - BSONElement lhsElt; - verify(lhsMember->getFieldDotted(fn, &lhsElt)); - - BSONElement rhsElt; - verify(rhsMember->getFieldDotted(fn, &rhsElt)); - - // false means don't compare field name. - int x = lhsElt.woCompare(rhsElt, false); - if (-1 == patternElt.number()) { x = -x; } - if (x != 0) { return x > 0; } + // If we see DL again it is not the same record as it once was so we still want to + // return it. + if (_dedup) { + _seen.erase(dl); + } +} + +// Is lhs less than rhs? Note that priority_queue is a max heap by default so we invert +// the return from the expected value. 
+bool MergeSortStage::StageWithValueComparison::operator()(const MergingRef& lhs, + const MergingRef& rhs) { + WorkingSetMember* lhsMember = _ws->get(lhs->id); + WorkingSetMember* rhsMember = _ws->get(rhs->id); + + BSONObjIterator it(_pattern); + while (it.more()) { + BSONElement patternElt = it.next(); + string fn = patternElt.fieldName(); + + BSONElement lhsElt; + verify(lhsMember->getFieldDotted(fn, &lhsElt)); + + BSONElement rhsElt; + verify(rhsMember->getFieldDotted(fn, &rhsElt)); + + // false means don't compare field name. + int x = lhsElt.woCompare(rhsElt, false); + if (-1 == patternElt.number()) { + x = -x; + } + if (x != 0) { + return x > 0; } - - // A comparator for use with sort is required to model a strict weak ordering, so - // to satisfy irreflexivity we must return 'false' for elements that we consider - // equivalent under the pattern. - return false; } - vector<PlanStage*> MergeSortStage::getChildren() const { - return _children; - } + // A comparator for use with sort is required to model a strict weak ordering, so + // to satisfy irreflexivity we must return 'false' for elements that we consider + // equivalent under the pattern. 
+ return false; +} - PlanStageStats* MergeSortStage::getStats() { - _commonStats.isEOF = isEOF(); +vector<PlanStage*> MergeSortStage::getChildren() const { + return _children; +} - _specificStats.sortPattern = _pattern; +PlanStageStats* MergeSortStage::getStats() { + _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_SORT_MERGE)); - ret->specific.reset(new MergeSortStats(_specificStats)); - for (size_t i = 0; i < _children.size(); ++i) { - ret->children.push_back(_children[i]->getStats()); - } - return ret.release(); - } + _specificStats.sortPattern = _pattern; - const CommonStats* MergeSortStage::getCommonStats() const { - return &_commonStats; + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_SORT_MERGE)); + ret->specific.reset(new MergeSortStats(_specificStats)); + for (size_t i = 0; i < _children.size(); ++i) { + ret->children.push_back(_children[i]->getStats()); } + return ret.release(); +} - const SpecificStats* MergeSortStage::getSpecificStats() const { - return &_specificStats; - } +const CommonStats* MergeSortStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* MergeSortStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/merge_sort.h b/src/mongo/db/exec/merge_sort.h index e8e216e5fe3..7ef6d960013 100644 --- a/src/mongo/db/exec/merge_sort.h +++ b/src/mongo/db/exec/merge_sort.h @@ -39,129 +39,130 @@ namespace mongo { - // External params for the merge sort stage. Declared below. - class MergeSortStageParams; - - /** - * Merges the outputs of N children, each of which is sorted in the order specified by - * 'pattern'. The output is sorted by 'pattern'. Practically speaking, all of this stage's - * children are indices. - * - * AKA the SERVER-1205 stage. Allows very efficient handling of the following query: - * find($or[{a:1}, {b:1}]).sort({c:1}) with indices {a:1, c:1} and {b:1, c:1}. 
- * - * Preconditions: For each field in 'pattern' all inputs in the child must handle a - * getFieldDotted for that field. - */ - class MergeSortStage : public PlanStage { - public: - MergeSortStage(const MergeSortStageParams& params, - WorkingSet* ws, - const Collection* collection); - virtual ~MergeSortStage(); +// External params for the merge sort stage. Declared below. +class MergeSortStageParams; - void addChild(PlanStage* child); +/** + * Merges the outputs of N children, each of which is sorted in the order specified by + * 'pattern'. The output is sorted by 'pattern'. Practically speaking, all of this stage's + * children are indices. + * + * AKA the SERVER-1205 stage. Allows very efficient handling of the following query: + * find($or[{a:1}, {b:1}]).sort({c:1}) with indices {a:1, c:1} and {b:1, c:1}. + * + * Preconditions: For each field in 'pattern' all inputs in the child must handle a + * getFieldDotted for that field. + */ +class MergeSortStage : public PlanStage { +public: + MergeSortStage(const MergeSortStageParams& params, + WorkingSet* ws, + const Collection* collection); + virtual ~MergeSortStage(); - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + void addChild(PlanStage* child); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - virtual std::vector<PlanStage*> getChildren() const; + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual StageType stageType() const { return STAGE_SORT_MERGE; } + virtual std::vector<PlanStage*> getChildren() const; - PlanStageStats* getStats(); + virtual StageType stageType() const { + return STAGE_SORT_MERGE; + } - virtual const CommonStats* getCommonStats() const; + 
PlanStageStats* getStats(); - virtual const SpecificStats* getSpecificStats() const; + virtual const CommonStats* getCommonStats() const; - static const char* kStageType; + virtual const SpecificStats* getSpecificStats() const; - private: - // Not owned by us. - const Collection* _collection; + static const char* kStageType; - // Not owned by us. - WorkingSet* _ws; +private: + // Not owned by us. + const Collection* _collection; - // The pattern that we're sorting by. - BSONObj _pattern; + // Not owned by us. + WorkingSet* _ws; + + // The pattern that we're sorting by. + BSONObj _pattern; + + // Are we deduplicating on RecordId? + bool _dedup; + + // Which RecordIds have we seen? + unordered_set<RecordId, RecordId::Hasher> _seen; + + // Owned by us. All the children we're reading from. + std::vector<PlanStage*> _children; - // Are we deduplicating on RecordId? - bool _dedup; - - // Which RecordIds have we seen? - unordered_set<RecordId, RecordId::Hasher> _seen; - - // Owned by us. All the children we're reading from. - std::vector<PlanStage*> _children; - - // In order to pick the next smallest value, we need each child work(...) until it produces - // a result. This is the queue of children that haven't given us a result yet. - std::queue<PlanStage*> _noResultToMerge; - - // There is some confusing STL wrangling going on below. Here's a guide: - // - // We want to keep a priority_queue of results so we can quickly return the min result. - // - // If we receive an invalidate, we need to iterate over any cached state to see if the - // invalidate is relevant. - // - // We can't iterate over a priority_queue, so we keep the actual cached state in a list and - // have a priority_queue of iterators into that list. - // - // Why an iterator instead of a pointer? We need to be able to use the information in the - // priority_queue to remove the item from the list and quickly. 
- - struct StageWithValue { - StageWithValue() : id(WorkingSet::INVALID_ID), stage(NULL) { } - WorkingSetID id; - PlanStage* stage; - }; - - // We have a priority queue of these. - typedef std::list<StageWithValue>::iterator MergingRef; - - // The comparison function used in our priority queue. - class StageWithValueComparison { - public: - StageWithValueComparison(WorkingSet* ws, BSONObj pattern) - : _ws(ws), _pattern(pattern) {} - - // Is lhs less than rhs? Note that priority_queue is a max heap by default so we invert - // the return from the expected value. - bool operator()(const MergingRef& lhs, const MergingRef& rhs); - - private: - WorkingSet* _ws; - BSONObj _pattern; - }; - - // The min heap of the results we're returning. - std::priority_queue<MergingRef, std::vector<MergingRef>, StageWithValueComparison> _merging; - - // The data referred to by the _merging queue above. - std::list<StageWithValue> _mergingData; - - // Stats - CommonStats _commonStats; - MergeSortStats _specificStats; + // In order to pick the next smallest value, we need each child work(...) until it produces + // a result. This is the queue of children that haven't given us a result yet. + std::queue<PlanStage*> _noResultToMerge; + + // There is some confusing STL wrangling going on below. Here's a guide: + // + // We want to keep a priority_queue of results so we can quickly return the min result. + // + // If we receive an invalidate, we need to iterate over any cached state to see if the + // invalidate is relevant. + // + // We can't iterate over a priority_queue, so we keep the actual cached state in a list and + // have a priority_queue of iterators into that list. + // + // Why an iterator instead of a pointer? We need to be able to use the information in the + // priority_queue to remove the item from the list and quickly. 
+ + struct StageWithValue { + StageWithValue() : id(WorkingSet::INVALID_ID), stage(NULL) {} + WorkingSetID id; + PlanStage* stage; }; - // Parameters that must be provided to a MergeSortStage - class MergeSortStageParams { + // We have a priority queue of these. + typedef std::list<StageWithValue>::iterator MergingRef; + + // The comparison function used in our priority queue. + class StageWithValueComparison { public: - MergeSortStageParams() : dedup(true) { } + StageWithValueComparison(WorkingSet* ws, BSONObj pattern) : _ws(ws), _pattern(pattern) {} - // How we're sorting. - BSONObj pattern; + // Is lhs less than rhs? Note that priority_queue is a max heap by default so we invert + // the return from the expected value. + bool operator()(const MergingRef& lhs, const MergingRef& rhs); - // Do we deduplicate on RecordId? - bool dedup; + private: + WorkingSet* _ws; + BSONObj _pattern; }; + // The min heap of the results we're returning. + std::priority_queue<MergingRef, std::vector<MergingRef>, StageWithValueComparison> _merging; + + // The data referred to by the _merging queue above. + std::list<StageWithValue> _mergingData; + + // Stats + CommonStats _commonStats; + MergeSortStats _specificStats; +}; + +// Parameters that must be provided to a MergeSortStage +class MergeSortStageParams { +public: + MergeSortStageParams() : dedup(true) {} + + // How we're sorting. + BSONObj pattern; + + // Do we deduplicate on RecordId? 
+ bool dedup; +}; + } // namespace mongo diff --git a/src/mongo/db/exec/multi_iterator.cpp b/src/mongo/db/exec/multi_iterator.cpp index f8aeaac8ca5..fe955cb05af 100644 --- a/src/mongo/db/exec/multi_iterator.cpp +++ b/src/mongo/db/exec/multi_iterator.cpp @@ -36,100 +36,96 @@ namespace mongo { - using std::vector; - - const char* MultiIteratorStage::kStageType = "MULTI_ITERATOR"; - - MultiIteratorStage::MultiIteratorStage(OperationContext* txn, - WorkingSet* ws, - Collection* collection) - : _txn(txn), - _collection(collection), - _ws(ws), - _wsidForFetch(_ws->allocate()) { - // We pre-allocate a WSM and use it to pass up fetch requests. This should never be used - // for anything other than passing up NEED_YIELD. We use the loc and owned obj state, but - // the loc isn't really pointing at any obj. The obj field of the WSM should never be used. - WorkingSetMember* member = _ws->get(_wsidForFetch); - member->state = WorkingSetMember::LOC_AND_OWNED_OBJ; +using std::vector; + +const char* MultiIteratorStage::kStageType = "MULTI_ITERATOR"; + +MultiIteratorStage::MultiIteratorStage(OperationContext* txn, + WorkingSet* ws, + Collection* collection) + : _txn(txn), _collection(collection), _ws(ws), _wsidForFetch(_ws->allocate()) { + // We pre-allocate a WSM and use it to pass up fetch requests. This should never be used + // for anything other than passing up NEED_YIELD. We use the loc and owned obj state, but + // the loc isn't really pointing at any obj. The obj field of the WSM should never be used. 
+ WorkingSetMember* member = _ws->get(_wsidForFetch); + member->state = WorkingSetMember::LOC_AND_OWNED_OBJ; +} + +void MultiIteratorStage::addIterator(std::unique_ptr<RecordCursor> it) { + _iterators.push_back(std::move(it)); +} + +PlanStage::StageState MultiIteratorStage::work(WorkingSetID* out) { + if (_collection == NULL) { + Status status(ErrorCodes::InternalError, "MultiIteratorStage died on null collection"); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + return PlanStage::DEAD; } - void MultiIteratorStage::addIterator(std::unique_ptr<RecordCursor> it) { - _iterators.push_back(std::move(it)); - } - - PlanStage::StageState MultiIteratorStage::work(WorkingSetID* out) { - if (_collection == NULL) { - Status status(ErrorCodes::InternalError, - "MultiIteratorStage died on null collection"); - *out = WorkingSetCommon::allocateStatusMember(_ws, status); - return PlanStage::DEAD; - } - - boost::optional<Record> record; - try { - while (!_iterators.empty()) { - if (auto fetcher = _iterators.back()->fetcherForNext()) { - // Pass the RecordFetcher off up. - WorkingSetMember* member = _ws->get(_wsidForFetch); - member->setFetcher(fetcher.release()); - *out = _wsidForFetch; - return NEED_YIELD; - } - - record = _iterators.back()->next(); - if (record) break; - _iterators.pop_back(); + boost::optional<Record> record; + try { + while (!_iterators.empty()) { + if (auto fetcher = _iterators.back()->fetcherForNext()) { + // Pass the RecordFetcher off up. + WorkingSetMember* member = _ws->get(_wsidForFetch); + member->setFetcher(fetcher.release()); + *out = _wsidForFetch; + return NEED_YIELD; } - } - catch (const WriteConflictException& wce) { - // If _advance throws a WCE we shouldn't have moved. 
- invariant(!_iterators.empty()); - *out = WorkingSet::INVALID_ID; - return NEED_YIELD; - } - - if (!record) - return IS_EOF; - - *out = _ws->allocate(); - WorkingSetMember* member = _ws->get(*out); - member->loc = record->id; - member->obj = {_txn->recoveryUnit()->getSnapshotId(), record->data.releaseToBson()}; - member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; - return PlanStage::ADVANCED; - } - bool MultiIteratorStage::isEOF() { - return _collection == NULL || _iterators.empty(); - } - - void MultiIteratorStage::kill() { - _collection = NULL; - _iterators.clear(); - } - - void MultiIteratorStage::saveState() { - _txn = NULL; - for (size_t i = 0; i < _iterators.size(); i++) { - _iterators[i]->savePositioned(); + record = _iterators.back()->next(); + if (record) + break; + _iterators.pop_back(); } + } catch (const WriteConflictException& wce) { + // If _advance throws a WCE we shouldn't have moved. + invariant(!_iterators.empty()); + *out = WorkingSet::INVALID_ID; + return NEED_YIELD; } - void MultiIteratorStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - for (size_t i = 0; i < _iterators.size(); i++) { - if (!_iterators[i]->restore(opCtx)) { - kill(); - } + if (!record) + return IS_EOF; + + *out = _ws->allocate(); + WorkingSetMember* member = _ws->get(*out); + member->loc = record->id; + member->obj = {_txn->recoveryUnit()->getSnapshotId(), record->data.releaseToBson()}; + member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; + return PlanStage::ADVANCED; +} + +bool MultiIteratorStage::isEOF() { + return _collection == NULL || _iterators.empty(); +} + +void MultiIteratorStage::kill() { + _collection = NULL; + _iterators.clear(); +} + +void MultiIteratorStage::saveState() { + _txn = NULL; + for (size_t i = 0; i < _iterators.size(); i++) { + _iterators[i]->savePositioned(); + } +} + +void MultiIteratorStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + for (size_t i = 0; i < 
_iterators.size(); i++) { + if (!_iterators[i]->restore(opCtx)) { + kill(); } } +} - void MultiIteratorStage::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - switch ( type ) { +void MultiIteratorStage::invalidate(OperationContext* txn, + const RecordId& dl, + InvalidationType type) { + switch (type) { case INVALIDATION_DELETION: for (size_t i = 0; i < _iterators.size(); i++) { _iterators[i]->invalidate(dl); @@ -138,19 +134,19 @@ namespace mongo { case INVALIDATION_MUTATION: // no-op break; - } } +} - vector<PlanStage*> MultiIteratorStage::getChildren() const { - vector<PlanStage*> empty; - return empty; - } +vector<PlanStage*> MultiIteratorStage::getChildren() const { + vector<PlanStage*> empty; + return empty; +} - PlanStageStats* MultiIteratorStage::getStats() { - std::unique_ptr<PlanStageStats> ret(new PlanStageStats(CommonStats(kStageType), - STAGE_MULTI_ITERATOR)); - ret->specific.reset(new CollectionScanStats()); - return ret.release(); - } +PlanStageStats* MultiIteratorStage::getStats() { + std::unique_ptr<PlanStageStats> ret( + new PlanStageStats(CommonStats(kStageType), STAGE_MULTI_ITERATOR)); + ret->specific.reset(new CollectionScanStats()); + return ret.release(); +} -} // namespace mongo +} // namespace mongo diff --git a/src/mongo/db/exec/multi_iterator.h b/src/mongo/db/exec/multi_iterator.h index ac2cf44b007..ada4bc16c9a 100644 --- a/src/mongo/db/exec/multi_iterator.h +++ b/src/mongo/db/exec/multi_iterator.h @@ -38,60 +38,66 @@ namespace mongo { - /** - * Iterates over a collection using multiple underlying RecordCursors. - * - * This is a special stage which is not used automatically by queries. It is intended for - * special commands that work with RecordCursors. 
For example, it is used by the - * parallelCollectionScan and repairCursor commands - */ - class MultiIteratorStage : public PlanStage { - public: - MultiIteratorStage(OperationContext* txn, WorkingSet* ws, Collection* collection); +/** + * Iterates over a collection using multiple underlying RecordCursors. + * + * This is a special stage which is not used automatically by queries. It is intended for + * special commands that work with RecordCursors. For example, it is used by the + * parallelCollectionScan and repairCursor commands + */ +class MultiIteratorStage : public PlanStage { +public: + MultiIteratorStage(OperationContext* txn, WorkingSet* ws, Collection* collection); - ~MultiIteratorStage() { } + ~MultiIteratorStage() {} - void addIterator(std::unique_ptr<RecordCursor> it); + void addIterator(std::unique_ptr<RecordCursor> it); - virtual PlanStage::StageState work(WorkingSetID* out); + virtual PlanStage::StageState work(WorkingSetID* out); - virtual bool isEOF(); + virtual bool isEOF(); - void kill(); + void kill(); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - // Returns empty PlanStageStats object - virtual PlanStageStats* getStats(); + // Returns empty PlanStageStats object + virtual PlanStageStats* getStats(); - // Not used. - virtual CommonStats* getCommonStats() const { return NULL; } + // Not used. + virtual CommonStats* getCommonStats() const { + return NULL; + } - // Not used. - virtual SpecificStats* getSpecificStats() const { return NULL; } + // Not used. + virtual SpecificStats* getSpecificStats() const { + return NULL; + } - // Not used. - virtual std::vector<PlanStage*> getChildren() const; + // Not used. 
+ virtual std::vector<PlanStage*> getChildren() const; - // Not used. - virtual StageType stageType() const { return STAGE_MULTI_ITERATOR; } + // Not used. + virtual StageType stageType() const { + return STAGE_MULTI_ITERATOR; + } - static const char* kStageType; + static const char* kStageType; - private: - OperationContext* _txn; - Collection* _collection; - std::vector<std::unique_ptr<RecordCursor>> _iterators; +private: + OperationContext* _txn; + Collection* _collection; + std::vector<std::unique_ptr<RecordCursor>> _iterators; - // Not owned by us. - WorkingSet* _ws; + // Not owned by us. + WorkingSet* _ws; - // We allocate a working set member with this id on construction of the stage. It gets - // used for all fetch requests, changing the RecordId as appropriate. - const WorkingSetID _wsidForFetch; - }; + // We allocate a working set member with this id on construction of the stage. It gets + // used for all fetch requests, changing the RecordId as appropriate. + const WorkingSetID _wsidForFetch; +}; -} // namespace mongo +} // namespace mongo diff --git a/src/mongo/db/exec/multi_plan.cpp b/src/mongo/db/exec/multi_plan.cpp index 8919ada64e7..7f82a8c3b5f 100644 --- a/src/mongo/db/exec/multi_plan.cpp +++ b/src/mongo/db/exec/multi_plan.cpp @@ -51,466 +51,462 @@ namespace mongo { - using std::unique_ptr; - using std::endl; - using std::list; - using std::vector; - - // static - const char* MultiPlanStage::kStageType = "MULTI_PLAN"; - - MultiPlanStage::MultiPlanStage(OperationContext* txn, - const Collection* collection, - CanonicalQuery* cq, - bool shouldCache) - : _txn(txn), - _collection(collection), - _shouldCache(shouldCache), - _query(cq), - _bestPlanIdx(kNoSuchPlan), - _backupPlanIdx(kNoSuchPlan), - _failure(false), - _failureCount(0), - _statusMemberId(WorkingSet::INVALID_ID), - _commonStats(kStageType) { - invariant(_collection); +using std::unique_ptr; +using std::endl; +using std::list; +using std::vector; + +// static +const char* 
MultiPlanStage::kStageType = "MULTI_PLAN"; + +MultiPlanStage::MultiPlanStage(OperationContext* txn, + const Collection* collection, + CanonicalQuery* cq, + bool shouldCache) + : _txn(txn), + _collection(collection), + _shouldCache(shouldCache), + _query(cq), + _bestPlanIdx(kNoSuchPlan), + _backupPlanIdx(kNoSuchPlan), + _failure(false), + _failureCount(0), + _statusMemberId(WorkingSet::INVALID_ID), + _commonStats(kStageType) { + invariant(_collection); +} + +MultiPlanStage::~MultiPlanStage() { + for (size_t ix = 0; ix < _candidates.size(); ++ix) { + delete _candidates[ix].solution; + delete _candidates[ix].root; } +} - MultiPlanStage::~MultiPlanStage() { - for (size_t ix = 0; ix < _candidates.size(); ++ix) { - delete _candidates[ix].solution; - delete _candidates[ix].root; - } +void MultiPlanStage::addPlan(QuerySolution* solution, PlanStage* root, WorkingSet* ws) { + _candidates.push_back(CandidatePlan(solution, root, ws)); +} + +bool MultiPlanStage::isEOF() { + if (_failure) { + return true; } - void MultiPlanStage::addPlan(QuerySolution* solution, PlanStage* root, - WorkingSet* ws) { - _candidates.push_back(CandidatePlan(solution, root, ws)); + // If _bestPlanIdx hasn't been found, can't be at EOF + if (!bestPlanChosen()) { + return false; } - bool MultiPlanStage::isEOF() { - if (_failure) { return true; } + // We must have returned all our cached results + // and there must be no more results from the best plan. + CandidatePlan& bestPlan = _candidates[_bestPlanIdx]; + return bestPlan.results.empty() && bestPlan.root->isEOF(); +} - // If _bestPlanIdx hasn't been found, can't be at EOF - if (!bestPlanChosen()) { return false; } +PlanStage::StageState MultiPlanStage::work(WorkingSetID* out) { + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - // We must have returned all our cached results - // and there must be no more results from the best plan. 
- CandidatePlan& bestPlan = _candidates[_bestPlanIdx]; - return bestPlan.results.empty() && bestPlan.root->isEOF(); + if (_failure) { + *out = _statusMemberId; + return PlanStage::FAILURE; } - PlanStage::StageState MultiPlanStage::work(WorkingSetID* out) { - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); - - if (_failure) { - *out = _statusMemberId; - return PlanStage::FAILURE; - } + CandidatePlan& bestPlan = _candidates[_bestPlanIdx]; - CandidatePlan& bestPlan = _candidates[_bestPlanIdx]; + // Look for an already produced result that provides the data the caller wants. + if (!bestPlan.results.empty()) { + *out = bestPlan.results.front(); + bestPlan.results.pop_front(); + _commonStats.advanced++; + return PlanStage::ADVANCED; + } - // Look for an already produced result that provides the data the caller wants. - if (!bestPlan.results.empty()) { - *out = bestPlan.results.front(); - bestPlan.results.pop_front(); - _commonStats.advanced++; - return PlanStage::ADVANCED; - } + // best plan had no (or has no more) cached results - // best plan had no (or has no more) cached results + StageState state = bestPlan.root->work(out); - StageState state = bestPlan.root->work(out); + if (PlanStage::FAILURE == state && hasBackupPlan()) { + LOG(5) << "Best plan errored out switching to backup\n"; + // Uncache the bad solution if we fall back + // on the backup solution. + // + // XXX: Instead of uncaching we should find a way for the + // cached plan runner to fall back on a different solution + // if the best solution fails. Alternatively we could try to + // defer cache insertion to be after the first produced result. - if (PlanStage::FAILURE == state && hasBackupPlan()) { - LOG(5) << "Best plan errored out switching to backup\n"; - // Uncache the bad solution if we fall back - // on the backup solution. 
- // - // XXX: Instead of uncaching we should find a way for the - // cached plan runner to fall back on a different solution - // if the best solution fails. Alternatively we could try to - // defer cache insertion to be after the first produced result. + _collection->infoCache()->getPlanCache()->remove(*_query); - _collection->infoCache()->getPlanCache()->remove(*_query); + _bestPlanIdx = _backupPlanIdx; + _backupPlanIdx = kNoSuchPlan; - _bestPlanIdx = _backupPlanIdx; - _backupPlanIdx = kNoSuchPlan; + return _candidates[_bestPlanIdx].root->work(out); + } - return _candidates[_bestPlanIdx].root->work(out); - } + if (hasBackupPlan() && PlanStage::ADVANCED == state) { + LOG(5) << "Best plan had a blocking stage, became unblocked\n"; + _backupPlanIdx = kNoSuchPlan; + } - if (hasBackupPlan() && PlanStage::ADVANCED == state) { - LOG(5) << "Best plan had a blocking stage, became unblocked\n"; - _backupPlanIdx = kNoSuchPlan; - } + // Increment stats. + if (PlanStage::ADVANCED == state) { + _commonStats.advanced++; + } else if (PlanStage::NEED_TIME == state) { + _commonStats.needTime++; + } else if (PlanStage::NEED_YIELD == state) { + _commonStats.needYield++; + } - // Increment stats. - if (PlanStage::ADVANCED == state) { - _commonStats.advanced++; - } - else if (PlanStage::NEED_TIME == state) { - _commonStats.needTime++; - } - else if (PlanStage::NEED_YIELD == state) { - _commonStats.needYield++; + return state; +} + +Status MultiPlanStage::tryYield(PlanYieldPolicy* yieldPolicy) { + // These are the conditions which can cause us to yield: + // 1) The yield policy's timer elapsed, or + // 2) some stage requested a yield due to a document fetch, or + // 3) we need to yield and retry due to a WriteConflictException. + // In all cases, the actual yielding happens here. 
+ if (yieldPolicy->shouldYield()) { + bool alive = yieldPolicy->yield(_fetcher.get()); + + if (!alive) { + _failure = true; + Status failStat(ErrorCodes::OperationFailed, + "PlanExecutor killed during plan selection"); + _statusMemberId = WorkingSetCommon::allocateStatusMember(_candidates[0].ws, failStat); + return failStat; } + } - return state; + // We're done using the fetcher, so it should be freed. We don't want to + // use the same RecordFetcher twice. + _fetcher.reset(); + + return Status::OK(); +} + +// static +size_t MultiPlanStage::getTrialPeriodWorks(OperationContext* txn, const Collection* collection) { + // Run each plan some number of times. This number is at least as great as + // 'internalQueryPlanEvaluationWorks', but may be larger for big collections. + size_t numWorks = internalQueryPlanEvaluationWorks; + if (NULL != collection) { + // For large collections, the number of works is set to be this + // fraction of the collection size. + double fraction = internalQueryPlanEvaluationCollFraction; + + numWorks = std::max(static_cast<size_t>(internalQueryPlanEvaluationWorks), + static_cast<size_t>(fraction * collection->numRecords(txn))); } - Status MultiPlanStage::tryYield(PlanYieldPolicy* yieldPolicy) { - // These are the conditions which can cause us to yield: - // 1) The yield policy's timer elapsed, or - // 2) some stage requested a yield due to a document fetch, or - // 3) we need to yield and retry due to a WriteConflictException. - // In all cases, the actual yielding happens here. - if (yieldPolicy->shouldYield()) { - bool alive = yieldPolicy->yield(_fetcher.get()); + return numWorks; +} + +// static +size_t MultiPlanStage::getTrialPeriodNumToReturn(const CanonicalQuery& query) { + // Determine the number of results which we will produce during the plan + // ranking phase before stopping. 
+ size_t numResults = static_cast<size_t>(internalQueryPlanEvaluationMaxResults); + if (query.getParsed().getLimit()) { + numResults = std::min(static_cast<size_t>(*query.getParsed().getLimit()), numResults); + } else if (!query.getParsed().isFromFindCommand() && query.getParsed().getBatchSize()) { + numResults = std::min(static_cast<size_t>(*query.getParsed().getBatchSize()), numResults); + } - if (!alive) { - _failure = true; - Status failStat(ErrorCodes::OperationFailed, - "PlanExecutor killed during plan selection"); - _statusMemberId = WorkingSetCommon::allocateStatusMember(_candidates[0].ws, - failStat); - return failStat; - } - } + return numResults; +} - // We're done using the fetcher, so it should be freed. We don't want to - // use the same RecordFetcher twice. - _fetcher.reset(); +Status MultiPlanStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) { + // Adds the amount of time taken by pickBestPlan() to executionTimeMillis. There's lots of + // execution work that happens here, so this is needed for the time accounting to + // make sense. + ScopedTimer timer(&_commonStats.executionTimeMillis); - return Status::OK(); - } + size_t numWorks = getTrialPeriodWorks(_txn, _collection); + size_t numResults = getTrialPeriodNumToReturn(*_query); - // static - size_t MultiPlanStage::getTrialPeriodWorks(OperationContext* txn, - const Collection* collection) { - // Run each plan some number of times. This number is at least as great as - // 'internalQueryPlanEvaluationWorks', but may be larger for big collections. - size_t numWorks = internalQueryPlanEvaluationWorks; - if (NULL != collection) { - // For large collections, the number of works is set to be this - // fraction of the collection size. 
- double fraction = internalQueryPlanEvaluationCollFraction; - - numWorks = std::max(static_cast<size_t>(internalQueryPlanEvaluationWorks), - static_cast<size_t>(fraction * collection->numRecords(txn))); + // Work the plans, stopping when a plan hits EOF or returns some + // fixed number of results. + for (size_t ix = 0; ix < numWorks; ++ix) { + bool moreToDo = workAllPlans(numResults, yieldPolicy); + if (!moreToDo) { + break; } + } - return numWorks; + if (_failure) { + invariant(WorkingSet::INVALID_ID != _statusMemberId); + WorkingSetMember* member = _candidates[0].ws->get(_statusMemberId); + return WorkingSetCommon::getMemberStatus(*member); } - // static - size_t MultiPlanStage::getTrialPeriodNumToReturn(const CanonicalQuery& query) { - // Determine the number of results which we will produce during the plan - // ranking phase before stopping. - size_t numResults = static_cast<size_t>(internalQueryPlanEvaluationMaxResults); - if (query.getParsed().getLimit()) { - numResults = std::min(static_cast<size_t>(*query.getParsed().getLimit()), - numResults); - } - else if (!query.getParsed().isFromFindCommand() && query.getParsed().getBatchSize()) { - numResults = std::min(static_cast<size_t>(*query.getParsed().getBatchSize()), - numResults); - } + // After picking best plan, ranking will own plan stats from + // candidate solutions (winner and losers). + std::unique_ptr<PlanRankingDecision> ranking(new PlanRankingDecision); + _bestPlanIdx = PlanRanker::pickBestPlan(_candidates, ranking.get()); + verify(_bestPlanIdx >= 0 && _bestPlanIdx < static_cast<int>(_candidates.size())); - return numResults; - } + // Copy candidate order. We will need this to sort candidate stats for explain + // after transferring ownership of 'ranking' to plan cache. + std::vector<size_t> candidateOrder = ranking->candidateOrder; - Status MultiPlanStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) { - // Adds the amount of time taken by pickBestPlan() to executionTimeMillis. 
There's lots of - // execution work that happens here, so this is needed for the time accounting to - // make sense. - ScopedTimer timer(&_commonStats.executionTimeMillis); + CandidatePlan& bestCandidate = _candidates[_bestPlanIdx]; + std::list<WorkingSetID>& alreadyProduced = bestCandidate.results; + QuerySolution* bestSolution = bestCandidate.solution; - size_t numWorks = getTrialPeriodWorks(_txn, _collection); - size_t numResults = getTrialPeriodNumToReturn(*_query); + LOG(5) << "Winning solution:\n" << bestSolution->toString() << endl; + LOG(2) << "Winning plan: " << Explain::getPlanSummary(bestCandidate.root); - // Work the plans, stopping when a plan hits EOF or returns some - // fixed number of results. - for (size_t ix = 0; ix < numWorks; ++ix) { - bool moreToDo = workAllPlans(numResults, yieldPolicy); - if (!moreToDo) { break; } + _backupPlanIdx = kNoSuchPlan; + if (bestSolution->hasBlockingStage && (0 == alreadyProduced.size())) { + LOG(5) << "Winner has blocking stage, looking for backup plan...\n"; + for (size_t ix = 0; ix < _candidates.size(); ++ix) { + if (!_candidates[ix].solution->hasBlockingStage) { + LOG(5) << "Candidate " << ix << " is backup child\n"; + _backupPlanIdx = ix; + break; + } } + } - if (_failure) { - invariant(WorkingSet::INVALID_ID != _statusMemberId); - WorkingSetMember* member = _candidates[0].ws->get(_statusMemberId); - return WorkingSetCommon::getMemberStatus(*member); + // Store the choice we just made in the cache, if the query is of a type that is safe to + // cache. + if (PlanCache::shouldCacheQuery(*_query) && _shouldCache) { + // Create list of candidate solutions for the cache with + // the best solution at the front. + std::vector<QuerySolution*> solutions; + + // Generate solutions and ranking decisions sorted by score. 
+ for (size_t orderingIndex = 0; orderingIndex < candidateOrder.size(); ++orderingIndex) { + // index into candidates/ranking + size_t ix = candidateOrder[orderingIndex]; + solutions.push_back(_candidates[ix].solution); } - // After picking best plan, ranking will own plan stats from - // candidate solutions (winner and losers). - std::unique_ptr<PlanRankingDecision> ranking(new PlanRankingDecision); - _bestPlanIdx = PlanRanker::pickBestPlan(_candidates, ranking.get()); - verify(_bestPlanIdx >= 0 && _bestPlanIdx < static_cast<int>(_candidates.size())); - - // Copy candidate order. We will need this to sort candidate stats for explain - // after transferring ownership of 'ranking' to plan cache. - std::vector<size_t> candidateOrder = ranking->candidateOrder; - - CandidatePlan& bestCandidate = _candidates[_bestPlanIdx]; - std::list<WorkingSetID>& alreadyProduced = bestCandidate.results; - QuerySolution* bestSolution = bestCandidate.solution; - - LOG(5) << "Winning solution:\n" << bestSolution->toString() << endl; - LOG(2) << "Winning plan: " << Explain::getPlanSummary(bestCandidate.root); - - _backupPlanIdx = kNoSuchPlan; - if (bestSolution->hasBlockingStage && (0 == alreadyProduced.size())) { - LOG(5) << "Winner has blocking stage, looking for backup plan...\n"; - for (size_t ix = 0; ix < _candidates.size(); ++ix) { - if (!_candidates[ix].solution->hasBlockingStage) { - LOG(5) << "Candidate " << ix << " is backup child\n"; - _backupPlanIdx = ix; - break; - } + // Check solution cache data. Do not add to cache if + // we have any invalid SolutionCacheData data. 
+ // XXX: One known example is 2D queries + bool validSolutions = true; + for (size_t ix = 0; ix < solutions.size(); ++ix) { + if (NULL == solutions[ix]->cacheData.get()) { + LOG(5) << "Not caching query because this solution has no cache data: " + << solutions[ix]->toString(); + validSolutions = false; + break; } } - // Store the choice we just made in the cache, if the query is of a type that is safe to - // cache. - if (PlanCache::shouldCacheQuery(*_query) && _shouldCache) { - // Create list of candidate solutions for the cache with - // the best solution at the front. - std::vector<QuerySolution*> solutions; - - // Generate solutions and ranking decisions sorted by score. - for (size_t orderingIndex = 0; - orderingIndex < candidateOrder.size(); ++orderingIndex) { - // index into candidates/ranking - size_t ix = candidateOrder[orderingIndex]; - solutions.push_back(_candidates[ix].solution); - } + if (validSolutions) { + _collection->infoCache()->getPlanCache()->add(*_query, solutions, ranking.release()); + } + } - // Check solution cache data. Do not add to cache if - // we have any invalid SolutionCacheData data. 
- // XXX: One known example is 2D queries - bool validSolutions = true; - for (size_t ix = 0; ix < solutions.size(); ++ix) { - if (NULL == solutions[ix]->cacheData.get()) { - LOG(5) << "Not caching query because this solution has no cache data: " - << solutions[ix]->toString(); - validSolutions = false; - break; - } - } + return Status::OK(); +} - if (validSolutions) { - _collection->infoCache()->getPlanCache()->add(*_query, solutions, ranking.release()); - } +vector<PlanStageStats*> MultiPlanStage::generateCandidateStats() { + OwnedPointerVector<PlanStageStats> candidateStats; + + for (size_t ix = 0; ix < _candidates.size(); ix++) { + if (ix == (size_t)_bestPlanIdx) { + continue; + } + if (ix == (size_t)_backupPlanIdx) { + continue; } - return Status::OK(); + PlanStageStats* stats = _candidates[ix].root->getStats(); + candidateStats.push_back(stats); } - vector<PlanStageStats*> MultiPlanStage::generateCandidateStats() { - OwnedPointerVector<PlanStageStats> candidateStats; + return candidateStats.release(); +} - for (size_t ix = 0; ix < _candidates.size(); ix++) { - if (ix == (size_t)_bestPlanIdx) { continue; } - if (ix == (size_t)_backupPlanIdx) { continue; } +bool MultiPlanStage::workAllPlans(size_t numResults, PlanYieldPolicy* yieldPolicy) { + bool doneWorking = false; - PlanStageStats* stats = _candidates[ix].root->getStats(); - candidateStats.push_back(stats); + for (size_t ix = 0; ix < _candidates.size(); ++ix) { + CandidatePlan& candidate = _candidates[ix]; + if (candidate.failed) { + continue; } - return candidateStats.release(); - } + // Might need to yield between calls to work due to the timer elapsing. 
+ if (!(tryYield(yieldPolicy)).isOK()) { + return false; + } - bool MultiPlanStage::workAllPlans(size_t numResults, PlanYieldPolicy* yieldPolicy) { - bool doneWorking = false; + WorkingSetID id = WorkingSet::INVALID_ID; + PlanStage::StageState state = candidate.root->work(&id); - for (size_t ix = 0; ix < _candidates.size(); ++ix) { - CandidatePlan& candidate = _candidates[ix]; - if (candidate.failed) { continue; } + if (PlanStage::ADVANCED == state) { + // Save result for later. + candidate.results.push_back(id); + + // Once a plan returns enough results, stop working. + if (candidate.results.size() >= numResults) { + doneWorking = true; + } + } else if (PlanStage::IS_EOF == state) { + // First plan to hit EOF wins automatically. Stop evaluating other plans. + // Assumes that the ranking will pick this plan. + doneWorking = true; + } else if (PlanStage::NEED_YIELD == state) { + if (id == WorkingSet::INVALID_ID) { + if (!yieldPolicy->allowedToYield()) + throw WriteConflictException(); + } else { + WorkingSetMember* member = candidate.ws->get(id); + invariant(member->hasFetcher()); + // Transfer ownership of the fetcher and yield. + _fetcher.reset(member->releaseFetcher()); + } + + if (yieldPolicy->allowedToYield()) { + yieldPolicy->forceYield(); + } - // Might need to yield between calls to work due to the timer elapsing. if (!(tryYield(yieldPolicy)).isOK()) { return false; } + } else if (PlanStage::NEED_TIME != state) { + // FAILURE or DEAD. Do we want to just tank that plan and try the rest? We + // probably want to fail globally as this shouldn't happen anyway. - WorkingSetID id = WorkingSet::INVALID_ID; - PlanStage::StageState state = candidate.root->work(&id); - - if (PlanStage::ADVANCED == state) { - // Save result for later. - candidate.results.push_back(id); + candidate.failed = true; + ++_failureCount; - // Once a plan returns enough results, stop working. 
- if (candidate.results.size() >= numResults) { - doneWorking = true; - } + // Propagate most recent seen failure to parent. + if (PlanStage::FAILURE == state) { + _statusMemberId = id; } - else if (PlanStage::IS_EOF == state) { - // First plan to hit EOF wins automatically. Stop evaluating other plans. - // Assumes that the ranking will pick this plan. - doneWorking = true; - } - else if (PlanStage::NEED_YIELD == state) { - if (id == WorkingSet::INVALID_ID) { - if (!yieldPolicy->allowedToYield()) - throw WriteConflictException(); - } - else { - WorkingSetMember* member = candidate.ws->get(id); - invariant(member->hasFetcher()); - // Transfer ownership of the fetcher and yield. - _fetcher.reset(member->releaseFetcher()); - } - - if (yieldPolicy->allowedToYield()) { - yieldPolicy->forceYield(); - } - - if (!(tryYield(yieldPolicy)).isOK()) { - return false; - } - } - else if (PlanStage::NEED_TIME != state) { - // FAILURE or DEAD. Do we want to just tank that plan and try the rest? We - // probably want to fail globally as this shouldn't happen anyway. - - candidate.failed = true; - ++_failureCount; - - // Propagate most recent seen failure to parent. 
- if (PlanStage::FAILURE == state) { - _statusMemberId = id; - } - - if (_failureCount == _candidates.size()) { - _failure = true; - return false; - } + + if (_failureCount == _candidates.size()) { + _failure = true; + return false; } } - - return !doneWorking; } - void MultiPlanStage::saveState() { - _txn = NULL; - for (size_t i = 0; i < _candidates.size(); ++i) { - _candidates[i].root->saveState(); - } + return !doneWorking; +} + +void MultiPlanStage::saveState() { + _txn = NULL; + for (size_t i = 0; i < _candidates.size(); ++i) { + _candidates[i].root->saveState(); } +} - void MultiPlanStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; +void MultiPlanStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; - for (size_t i = 0; i < _candidates.size(); ++i) { - _candidates[i].root->restoreState(opCtx); + for (size_t i = 0; i < _candidates.size(); ++i) { + _candidates[i].root->restoreState(opCtx); + } +} + +namespace { + +void invalidateHelper(OperationContext* txn, + WorkingSet* ws, // may flag for review + const RecordId& dl, + list<WorkingSetID>* idsToInvalidate, + const Collection* collection) { + for (list<WorkingSetID>::iterator it = idsToInvalidate->begin(); + it != idsToInvalidate->end();) { + WorkingSetMember* member = ws->get(*it); + if (member->hasLoc() && member->loc == dl) { + list<WorkingSetID>::iterator next = it; + next++; + WorkingSetCommon::fetchAndInvalidateLoc(txn, member, collection); + ws->flagForReview(*it); + idsToInvalidate->erase(it); + it = next; + } else { + it++; } } +} +} - namespace { - - void invalidateHelper(OperationContext* txn, - WorkingSet* ws, // may flag for review - const RecordId& dl, - list<WorkingSetID>* idsToInvalidate, - const Collection* collection) { - for (list<WorkingSetID>::iterator it = idsToInvalidate->begin(); - it != idsToInvalidate->end();) { - WorkingSetMember* member = ws->get(*it); - if (member->hasLoc() && member->loc == dl) { - 
list<WorkingSetID>::iterator next = it; - next++; - WorkingSetCommon::fetchAndInvalidateLoc(txn, member, collection); - ws->flagForReview(*it); - idsToInvalidate->erase(it); - it = next; - } - else { - it++; - } - } - } +void MultiPlanStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + if (_failure) { + return; } - void MultiPlanStage::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - if (_failure) { return; } - - if (bestPlanChosen()) { - CandidatePlan& bestPlan = _candidates[_bestPlanIdx]; - bestPlan.root->invalidate(txn, dl, type); - invalidateHelper(txn, bestPlan.ws, dl, &bestPlan.results, _collection); - if (hasBackupPlan()) { - CandidatePlan& backupPlan = _candidates[_backupPlanIdx]; - backupPlan.root->invalidate(txn, dl, type); - invalidateHelper(txn, backupPlan.ws, dl, &backupPlan.results, _collection); - } + if (bestPlanChosen()) { + CandidatePlan& bestPlan = _candidates[_bestPlanIdx]; + bestPlan.root->invalidate(txn, dl, type); + invalidateHelper(txn, bestPlan.ws, dl, &bestPlan.results, _collection); + if (hasBackupPlan()) { + CandidatePlan& backupPlan = _candidates[_backupPlanIdx]; + backupPlan.root->invalidate(txn, dl, type); + invalidateHelper(txn, backupPlan.ws, dl, &backupPlan.results, _collection); } - else { - for (size_t ix = 0; ix < _candidates.size(); ++ix) { - _candidates[ix].root->invalidate(txn, dl, type); - invalidateHelper(txn, _candidates[ix].ws, dl, &_candidates[ix].results, _collection); - } + } else { + for (size_t ix = 0; ix < _candidates.size(); ++ix) { + _candidates[ix].root->invalidate(txn, dl, type); + invalidateHelper(txn, _candidates[ix].ws, dl, &_candidates[ix].results, _collection); } } +} - bool MultiPlanStage::hasBackupPlan() const { - return kNoSuchPlan != _backupPlanIdx; - } +bool MultiPlanStage::hasBackupPlan() const { + return kNoSuchPlan != _backupPlanIdx; +} - bool MultiPlanStage::bestPlanChosen() const { - return kNoSuchPlan != _bestPlanIdx; - } 
+bool MultiPlanStage::bestPlanChosen() const { + return kNoSuchPlan != _bestPlanIdx; +} - int MultiPlanStage::bestPlanIdx() const { - return _bestPlanIdx; - } +int MultiPlanStage::bestPlanIdx() const { + return _bestPlanIdx; +} - QuerySolution* MultiPlanStage::bestSolution() { - if (_bestPlanIdx == kNoSuchPlan) - return NULL; +QuerySolution* MultiPlanStage::bestSolution() { + if (_bestPlanIdx == kNoSuchPlan) + return NULL; - return _candidates[_bestPlanIdx].solution; - } + return _candidates[_bestPlanIdx].solution; +} - vector<PlanStage*> MultiPlanStage::getChildren() const { - vector<PlanStage*> children; +vector<PlanStage*> MultiPlanStage::getChildren() const { + vector<PlanStage*> children; - if (bestPlanChosen()) { - children.push_back(_candidates[_bestPlanIdx].root); - } - else { - for (size_t i = 0; i < _candidates.size(); i++) { - children.push_back(_candidates[i].root); - } + if (bestPlanChosen()) { + children.push_back(_candidates[_bestPlanIdx].root); + } else { + for (size_t i = 0; i < _candidates.size(); i++) { + children.push_back(_candidates[i].root); } - - return children; } - PlanStageStats* MultiPlanStage::getStats() { - if (bestPlanChosen()) { - return _candidates[_bestPlanIdx].root->getStats(); - } - if (hasBackupPlan()) { - return _candidates[_backupPlanIdx].root->getStats(); - } - _commonStats.isEOF = isEOF(); - - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_MULTI_PLAN)); + return children; +} - return ret.release(); +PlanStageStats* MultiPlanStage::getStats() { + if (bestPlanChosen()) { + return _candidates[_bestPlanIdx].root->getStats(); } - - const CommonStats* MultiPlanStage::getCommonStats() const { - return &_commonStats; + if (hasBackupPlan()) { + return _candidates[_backupPlanIdx].root->getStats(); } + _commonStats.isEOF = isEOF(); - const SpecificStats* MultiPlanStage::getSpecificStats() const { - return &_specificStats; - } + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_MULTI_PLAN)); + 
+ return ret.release(); +} + +const CommonStats* MultiPlanStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* MultiPlanStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/multi_plan.h b/src/mongo/db/exec/multi_plan.h index 189190163ca..28030fb8d34 100644 --- a/src/mongo/db/exec/multi_plan.h +++ b/src/mongo/db/exec/multi_plan.h @@ -41,192 +41,194 @@ namespace mongo { +/** + * This stage outputs its mainChild, and possibly it's backup child + * and also updates the cache. + * + * Preconditions: Valid RecordId. + * + * Owns the query solutions and PlanStage roots for all candidate plans. + */ +class MultiPlanStage : public PlanStage { +public: + /** + * Takes no ownership. + * + * If 'shouldCache' is true, writes a cache entry for the winning plan to the plan cache + * when possible. If 'shouldCache' is false, the plan cache will never be written. + */ + MultiPlanStage(OperationContext* txn, + const Collection* collection, + CanonicalQuery* cq, + bool shouldCache = true); + + virtual ~MultiPlanStage(); + + virtual bool isEOF(); + + virtual StageState work(WorkingSetID* out); + + virtual void saveState(); + + virtual void restoreState(OperationContext* opCtx); + + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + + virtual std::vector<PlanStage*> getChildren() const; + + virtual StageType stageType() const { + return STAGE_MULTI_PLAN; + } + + virtual PlanStageStats* getStats(); + + virtual const CommonStats* getCommonStats() const; + + virtual const SpecificStats* getSpecificStats() const; + + /** + * Takes ownership of QuerySolution and PlanStage. not of WorkingSet + */ + void addPlan(QuerySolution* solution, PlanStage* root, WorkingSet* sharedWs); + + /** + * Runs all plans added by addPlan, ranks them, and picks a best. + * All further calls to work(...) will return results from the best plan. 
+ * + * If 'yieldPolicy' is non-NULL, then all locks may be yielded in between round-robin + * works of the candidate plans. By default, 'yieldPolicy' is NULL and no yielding will + * take place. + * + * Returns a non-OK status if the plan was killed during yield. + */ + Status pickBestPlan(PlanYieldPolicy* yieldPolicy); + + /** + * Returns the number of times that we are willing to work a plan during a trial period. + * + * Calculated based on a fixed query knob and the size of the collection. + */ + static size_t getTrialPeriodWorks(OperationContext* txn, const Collection* collection); + + /** + * Returns the max number of documents which we should allow any plan to return during the + * trial period. As soon as any plan hits this number of documents, the trial period ends. + */ + static size_t getTrialPeriodNumToReturn(const CanonicalQuery& query); + + /** Return true if a best plan has been chosen */ + bool bestPlanChosen() const; + + /** Return the index of the best plan chosen, for testing */ + int bestPlanIdx() const; + + /** + * Returns the QuerySolution for the best plan, or NULL if no best plan + * + * The MultiPlanStage retains ownership of the winning QuerySolution and returns an + * unowned pointer. + */ + QuerySolution* bestSolution(); + /** - * This stage outputs its mainChild, and possibly it's backup child - * and also updates the cache. + * Returns true if a backup plan was picked. + * This is the case when the best plan has a blocking stage. + * Exposed for testing. + */ + bool hasBackupPlan() const; + + // + // Used by explain. + // + + /** + * Gathers execution stats for all losing plans. Caller takes ownership of + * all pointers in the returned vector. + */ + std::vector<PlanStageStats*> generateCandidateStats(); + + static const char* kStageType; + +private: + // + // Have all our candidate plans do something. + // If all our candidate plans fail, *objOut will contain + // information on the failure. 
+ // + + /** + * Calls work on each child plan in a round-robin fashion. We stop when any plan hits EOF + * or returns 'numResults' results. * - * Preconditions: Valid RecordId. + * Returns true if we need to keep working the plans and false otherwise. + */ + bool workAllPlans(size_t numResults, PlanYieldPolicy* yieldPolicy); + + /** + * Checks whether we need to perform either a timing-based yield or a yield for a document + * fetch. If so, then uses 'yieldPolicy' to actually perform the yield. * - * Owns the query solutions and PlanStage roots for all candidate plans. + * Returns a non-OK status if killed during a yield. */ - class MultiPlanStage : public PlanStage { - public: - /** - * Takes no ownership. - * - * If 'shouldCache' is true, writes a cache entry for the winning plan to the plan cache - * when possible. If 'shouldCache' is false, the plan cache will never be written. - */ - MultiPlanStage(OperationContext* txn, - const Collection* collection, - CanonicalQuery* cq, - bool shouldCache = true); - - virtual ~MultiPlanStage(); - - virtual bool isEOF(); - - virtual StageState work(WorkingSetID* out); - - virtual void saveState(); - - virtual void restoreState(OperationContext* opCtx); - - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - - virtual std::vector<PlanStage*> getChildren() const; - - virtual StageType stageType() const { return STAGE_MULTI_PLAN; } - - virtual PlanStageStats* getStats(); - - virtual const CommonStats* getCommonStats() const; - - virtual const SpecificStats* getSpecificStats() const; - - /** - * Takes ownership of QuerySolution and PlanStage. not of WorkingSet - */ - void addPlan(QuerySolution* solution, PlanStage* root, WorkingSet* sharedWs); - - /** - * Runs all plans added by addPlan, ranks them, and picks a best. - * All further calls to work(...) will return results from the best plan. 
- * - * If 'yieldPolicy' is non-NULL, then all locks may be yielded in between round-robin - * works of the candidate plans. By default, 'yieldPolicy' is NULL and no yielding will - * take place. - * - * Returns a non-OK status if the plan was killed during yield. - */ - Status pickBestPlan(PlanYieldPolicy* yieldPolicy); - - /** - * Returns the number of times that we are willing to work a plan during a trial period. - * - * Calculated based on a fixed query knob and the size of the collection. - */ - static size_t getTrialPeriodWorks(OperationContext* txn, const Collection* collection); - - /** - * Returns the max number of documents which we should allow any plan to return during the - * trial period. As soon as any plan hits this number of documents, the trial period ends. - */ - static size_t getTrialPeriodNumToReturn(const CanonicalQuery& query); - - /** Return true if a best plan has been chosen */ - bool bestPlanChosen() const; - - /** Return the index of the best plan chosen, for testing */ - int bestPlanIdx() const; - - /** - * Returns the QuerySolution for the best plan, or NULL if no best plan - * - * The MultiPlanStage retains ownership of the winning QuerySolution and returns an - * unowned pointer. - */ - QuerySolution* bestSolution(); - - /** - * Returns true if a backup plan was picked. - * This is the case when the best plan has a blocking stage. - * Exposed for testing. - */ - bool hasBackupPlan() const; - - // - // Used by explain. - // - - /** - * Gathers execution stats for all losing plans. Caller takes ownership of - * all pointers in the returned vector. - */ - std::vector<PlanStageStats*> generateCandidateStats(); - - static const char* kStageType; - - private: - // - // Have all our candidate plans do something. - // If all our candidate plans fail, *objOut will contain - // information on the failure. - // - - /** - * Calls work on each child plan in a round-robin fashion. 
We stop when any plan hits EOF - * or returns 'numResults' results. - * - * Returns true if we need to keep working the plans and false otherwise. - */ - bool workAllPlans(size_t numResults, PlanYieldPolicy* yieldPolicy); - - /** - * Checks whether we need to perform either a timing-based yield or a yield for a document - * fetch. If so, then uses 'yieldPolicy' to actually perform the yield. - * - * Returns a non-OK status if killed during a yield. - */ - Status tryYield(PlanYieldPolicy* yieldPolicy); - - static const int kNoSuchPlan = -1; - - // Not owned here. - OperationContext* _txn; - - // Not owned here. Must be non-null. - const Collection* _collection; - - // Whether or not we should try to cache the winning plan in the plan cache. - const bool _shouldCache; - - // The query that we're trying to figure out the best solution to. - // not owned here - CanonicalQuery* _query; - - // Candidate plans. Each candidate includes a child PlanStage tree and QuerySolution which - // are owned here. Ownership of all QuerySolutions is retained here, and will *not* be - // tranferred to the PlanExecutor that wraps this stage. - std::vector<CandidatePlan> _candidates; - - // index into _candidates, of the winner of the plan competition - // uses -1 / kNoSuchPlan when best plan is not (yet) known - int _bestPlanIdx; - - // index into _candidates, of the backup plan for sort - // uses -1 / kNoSuchPlan when best plan is not (yet) known - int _backupPlanIdx; - - // Set if this MultiPlanStage cannot continue, and the query must fail. This can happen in - // two ways. The first is that all candidate plans fail. Note that one plan can fail - // during normal execution of the plan competition. Here is an example: - // - // Plan 1: collection scan with sort. Sort runs out of memory. - // Plan 2: ixscan that provides sort. Won't run out of memory. - // - // We want to choose plan 2 even if plan 1 fails. 
- // - // The second way for failure to occur is that the execution of this query is killed during - // a yield, by some concurrent event such as a collection drop. - bool _failure; - - // If everything fails during the plan competition, we can't pick one. - size_t _failureCount; - - // if pickBestPlan fails, this is set to the wsid of the statusMember - // returned by ::work() - WorkingSetID _statusMemberId; - - // When a stage requests a yield for document fetch, it gives us back a RecordFetcher* - // to use to pull the record into memory. We take ownership of the RecordFetcher here, - // deleting it after we've had a chance to do the fetch. For timing-based yields, we - // just pass a NULL fetcher. - std::unique_ptr<RecordFetcher> _fetcher; - - // Stats - CommonStats _commonStats; - MultiPlanStats _specificStats; - }; + Status tryYield(PlanYieldPolicy* yieldPolicy); + + static const int kNoSuchPlan = -1; + + // Not owned here. + OperationContext* _txn; + + // Not owned here. Must be non-null. + const Collection* _collection; + + // Whether or not we should try to cache the winning plan in the plan cache. + const bool _shouldCache; + + // The query that we're trying to figure out the best solution to. + // not owned here + CanonicalQuery* _query; + + // Candidate plans. Each candidate includes a child PlanStage tree and QuerySolution which + // are owned here. Ownership of all QuerySolutions is retained here, and will *not* be + // tranferred to the PlanExecutor that wraps this stage. + std::vector<CandidatePlan> _candidates; + + // index into _candidates, of the winner of the plan competition + // uses -1 / kNoSuchPlan when best plan is not (yet) known + int _bestPlanIdx; + + // index into _candidates, of the backup plan for sort + // uses -1 / kNoSuchPlan when best plan is not (yet) known + int _backupPlanIdx; + + // Set if this MultiPlanStage cannot continue, and the query must fail. This can happen in + // two ways. The first is that all candidate plans fail. 
Note that one plan can fail + // during normal execution of the plan competition. Here is an example: + // + // Plan 1: collection scan with sort. Sort runs out of memory. + // Plan 2: ixscan that provides sort. Won't run out of memory. + // + // We want to choose plan 2 even if plan 1 fails. + // + // The second way for failure to occur is that the execution of this query is killed during + // a yield, by some concurrent event such as a collection drop. + bool _failure; + + // If everything fails during the plan competition, we can't pick one. + size_t _failureCount; + + // if pickBestPlan fails, this is set to the wsid of the statusMember + // returned by ::work() + WorkingSetID _statusMemberId; + + // When a stage requests a yield for document fetch, it gives us back a RecordFetcher* + // to use to pull the record into memory. We take ownership of the RecordFetcher here, + // deleting it after we've had a chance to do the fetch. For timing-based yields, we + // just pass a NULL fetcher. 
+ std::unique_ptr<RecordFetcher> _fetcher; + + // Stats + CommonStats _commonStats; + MultiPlanStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/near.cpp b/src/mongo/db/exec/near.cpp index f7c1c9035d3..515120d86a6 100644 --- a/src/mongo/db/exec/near.cpp +++ b/src/mongo/db/exec/near.cpp @@ -36,360 +36,334 @@ namespace mongo { - using std::vector; - - NearStage::NearStage(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - PlanStageStats* stats) - : _txn(txn), - _workingSet(workingSet), - _collection(collection), - _searchState(SearchState_Initializing), - _stats(stats), - _nextInterval(NULL) { - - // Ensure we have specific distance search stats unless a child class specified their - // own distance stats subclass - if (!_stats->specific) { - _stats->specific.reset(new NearStats); - } +using std::vector; + +NearStage::NearStage(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + PlanStageStats* stats) + : _txn(txn), + _workingSet(workingSet), + _collection(collection), + _searchState(SearchState_Initializing), + _stats(stats), + _nextInterval(NULL) { + // Ensure we have specific distance search stats unless a child class specified their + // own distance stats subclass + if (!_stats->specific) { + _stats->specific.reset(new NearStats); } - - NearStage::~NearStage() { +} + +NearStage::~NearStage() {} + +NearStage::CoveredInterval::CoveredInterval(PlanStage* covering, + bool dedupCovering, + double minDistance, + double maxDistance, + bool inclusiveMax) + : covering(covering), + dedupCovering(dedupCovering), + minDistance(minDistance), + maxDistance(maxDistance), + inclusiveMax(inclusiveMax) {} + + +PlanStage::StageState NearStage::initNext(WorkingSetID* out) { + PlanStage::StageState state = initialize(_txn, _workingSet, _collection, out); + if (state == PlanStage::IS_EOF) { + _searchState = SearchState_Buffering; + return PlanStage::NEED_TIME; } - 
NearStage::CoveredInterval::CoveredInterval(PlanStage* covering, - bool dedupCovering, - double minDistance, - double maxDistance, - bool inclusiveMax) : - covering(covering), - dedupCovering(dedupCovering), - minDistance(minDistance), - maxDistance(maxDistance), - inclusiveMax(inclusiveMax) { - } + invariant(state != PlanStage::ADVANCED); + // Propagate NEED_TIME or errors upward. + return state; +} - PlanStage::StageState NearStage::initNext(WorkingSetID* out) { - PlanStage::StageState state = initialize(_txn, _workingSet, _collection, out); - if (state == PlanStage::IS_EOF) { - _searchState = SearchState_Buffering; - return PlanStage::NEED_TIME; - } +PlanStage::StageState NearStage::work(WorkingSetID* out) { + ++_stats->common.works; - invariant(state != PlanStage::ADVANCED); + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_stats->common.executionTimeMillis); - // Propagate NEED_TIME or errors upward. - return state; - } + WorkingSetID toReturn = WorkingSet::INVALID_ID; + Status error = Status::OK(); + PlanStage::StageState nextState = PlanStage::NEED_TIME; - PlanStage::StageState NearStage::work(WorkingSetID* out) { + // + // Work the search + // - ++_stats->common.works; + if (SearchState_Initializing == _searchState) { + nextState = initNext(&toReturn); + } else if (SearchState_Buffering == _searchState) { + nextState = bufferNext(&toReturn, &error); + } else if (SearchState_Advancing == _searchState) { + nextState = advanceNext(&toReturn); + } else { + invariant(SearchState_Finished == _searchState); + nextState = PlanStage::IS_EOF; + } - // Adds the amount of time taken by work() to executionTimeMillis. 
- ScopedTimer timer(&_stats->common.executionTimeMillis); + // + // Handle the results + // + + if (PlanStage::FAILURE == nextState) { + *out = WorkingSetCommon::allocateStatusMember(_workingSet, error); + } else if (PlanStage::ADVANCED == nextState) { + *out = toReturn; + ++_stats->common.advanced; + } else if (PlanStage::NEED_YIELD == nextState) { + *out = toReturn; + ++_stats->common.needYield; + } else if (PlanStage::NEED_TIME == nextState) { + ++_stats->common.needTime; + } else if (PlanStage::IS_EOF == nextState) { + _stats->common.isEOF = true; + } - WorkingSetID toReturn = WorkingSet::INVALID_ID; - Status error = Status::OK(); - PlanStage::StageState nextState = PlanStage::NEED_TIME; + return nextState; +} - // - // Work the search - // +/** + * Holds a generic search result with a distance computed in some fashion. + */ +struct NearStage::SearchResult { + SearchResult(WorkingSetID resultID, double distance) : resultID(resultID), distance(distance) {} - if (SearchState_Initializing == _searchState) { - nextState = initNext(&toReturn); - } - else if (SearchState_Buffering == _searchState) { - nextState = bufferNext(&toReturn, &error); - } - else if (SearchState_Advancing == _searchState) { - nextState = advanceNext(&toReturn); - } - else { - invariant(SearchState_Finished == _searchState); - nextState = PlanStage::IS_EOF; - } + bool operator<(const SearchResult& other) const { + // We want increasing distance, not decreasing, so we reverse the < + return distance > other.distance; + } - // - // Handle the results - // + WorkingSetID resultID; + double distance; +}; - if (PlanStage::FAILURE == nextState) { - *out = WorkingSetCommon::allocateStatusMember(_workingSet, error); - } - else if (PlanStage::ADVANCED == nextState) { - *out = toReturn; - ++_stats->common.advanced; - } - else if (PlanStage::NEED_YIELD == nextState) { - *out = toReturn; - ++_stats->common.needYield; - } - else if (PlanStage::NEED_TIME == nextState) { - ++_stats->common.needTime; +// Set 
"toReturn" when NEED_YIELD. +PlanStage::StageState NearStage::bufferNext(WorkingSetID* toReturn, Status* error) { + // + // Try to retrieve the next covered member + // + + if (!_nextInterval) { + StatusWith<CoveredInterval*> intervalStatus = nextInterval(_txn, _workingSet, _collection); + if (!intervalStatus.isOK()) { + _searchState = SearchState_Finished; + *error = intervalStatus.getStatus(); + return PlanStage::FAILURE; } - else if (PlanStage::IS_EOF == nextState) { - _stats->common.isEOF = true; + + if (NULL == intervalStatus.getValue()) { + _searchState = SearchState_Finished; + return PlanStage::IS_EOF; } - return nextState; + // CoveredInterval and its child stage are owned by _childrenIntervals + _childrenIntervals.push_back(intervalStatus.getValue()); + _nextInterval = _childrenIntervals.back(); + _nextIntervalStats.reset(new IntervalStats()); + _nextIntervalStats->minDistanceAllowed = _nextInterval->minDistance; + _nextIntervalStats->maxDistanceAllowed = _nextInterval->maxDistance; + _nextIntervalStats->inclusiveMaxDistanceAllowed = _nextInterval->inclusiveMax; } - /** - * Holds a generic search result with a distance computed in some fashion. 
- */ - struct NearStage::SearchResult { - - SearchResult(WorkingSetID resultID, double distance) : - resultID(resultID), distance(distance) { - } + WorkingSetID nextMemberID; + PlanStage::StageState intervalState = _nextInterval->covering->work(&nextMemberID); - bool operator<(const SearchResult& other) const { - // We want increasing distance, not decreasing, so we reverse the < - return distance > other.distance; - } + if (PlanStage::IS_EOF == intervalState) { + getNearStats()->intervalStats.push_back(*_nextIntervalStats); + _nextIntervalStats.reset(); + _nextInterval = NULL; + _searchState = SearchState_Advancing; + return PlanStage::NEED_TIME; + } else if (PlanStage::FAILURE == intervalState) { + *error = WorkingSetCommon::getMemberStatus(*_workingSet->get(nextMemberID)); + return intervalState; + } else if (PlanStage::NEED_YIELD == intervalState) { + *toReturn = nextMemberID; + return intervalState; + } else if (PlanStage::ADVANCED != intervalState) { + return intervalState; + } - WorkingSetID resultID; - double distance; - }; - - // Set "toReturn" when NEED_YIELD. 
- PlanStage::StageState NearStage::bufferNext(WorkingSetID* toReturn, Status* error) { - - // - // Try to retrieve the next covered member - // - - if (!_nextInterval) { - - StatusWith<CoveredInterval*> intervalStatus = nextInterval(_txn, - _workingSet, - _collection); - if (!intervalStatus.isOK()) { - _searchState = SearchState_Finished; - *error = intervalStatus.getStatus(); - return PlanStage::FAILURE; - } - - if (NULL == intervalStatus.getValue()) { - _searchState = SearchState_Finished; - return PlanStage::IS_EOF; - } - - // CoveredInterval and its child stage are owned by _childrenIntervals - _childrenIntervals.push_back(intervalStatus.getValue()); - _nextInterval = _childrenIntervals.back(); - _nextIntervalStats.reset(new IntervalStats()); - _nextIntervalStats->minDistanceAllowed = _nextInterval->minDistance; - _nextIntervalStats->maxDistanceAllowed = _nextInterval->maxDistance; - _nextIntervalStats->inclusiveMaxDistanceAllowed = _nextInterval->inclusiveMax; - } + // + // Try to buffer the next covered member + // - WorkingSetID nextMemberID; - PlanStage::StageState intervalState = _nextInterval->covering->work(&nextMemberID); + WorkingSetMember* nextMember = _workingSet->get(nextMemberID); - if (PlanStage::IS_EOF == intervalState) { - getNearStats()->intervalStats.push_back(*_nextIntervalStats); - _nextIntervalStats.reset(); - _nextInterval = NULL; - _searchState = SearchState_Advancing; + // The child stage may not dedup so we must dedup them ourselves. 
+ if (_nextInterval->dedupCovering && nextMember->hasLoc()) { + if (_nextIntervalSeen.end() != _nextIntervalSeen.find(nextMember->loc)) { + _workingSet->free(nextMemberID); return PlanStage::NEED_TIME; } - else if (PlanStage::FAILURE == intervalState) { - *error = WorkingSetCommon::getMemberStatus(*_workingSet->get(nextMemberID)); - return intervalState; - } - else if (PlanStage::NEED_YIELD == intervalState) { - *toReturn = nextMemberID; - return intervalState; - } - else if (PlanStage::ADVANCED != intervalState) { - return intervalState; - } - - // - // Try to buffer the next covered member - // + } - WorkingSetMember* nextMember = _workingSet->get(nextMemberID); + ++_nextIntervalStats->numResultsFound; - // The child stage may not dedup so we must dedup them ourselves. - if (_nextInterval->dedupCovering && nextMember->hasLoc()) { - if (_nextIntervalSeen.end() != _nextIntervalSeen.find(nextMember->loc)) { - _workingSet->free(nextMemberID); - return PlanStage::NEED_TIME; - } - } + StatusWith<double> distanceStatus = computeDistance(nextMember); - ++_nextIntervalStats->numResultsFound; + if (!distanceStatus.isOK()) { + _searchState = SearchState_Finished; + *error = distanceStatus.getStatus(); + return PlanStage::FAILURE; + } - StatusWith<double> distanceStatus = computeDistance(nextMember); + // If the member's distance is in the current distance interval, add it to our buffered + // results. + double memberDistance = distanceStatus.getValue(); + bool inInterval = memberDistance >= _nextInterval->minDistance && + (_nextInterval->inclusiveMax ? 
memberDistance <= _nextInterval->maxDistance + : memberDistance < _nextInterval->maxDistance); + + // Update found distance stats + if (_nextIntervalStats->minDistanceFound < 0 || + memberDistance < _nextIntervalStats->minDistanceFound) { + _nextIntervalStats->minDistanceFound = memberDistance; + } - if (!distanceStatus.isOK()) { - _searchState = SearchState_Finished; - *error = distanceStatus.getStatus(); - return PlanStage::FAILURE; - } + if (_nextIntervalStats->maxDistanceFound < 0 || + memberDistance > _nextIntervalStats->maxDistanceFound) { + _nextIntervalStats->maxDistanceFound = memberDistance; + } - // If the member's distance is in the current distance interval, add it to our buffered - // results. - double memberDistance = distanceStatus.getValue(); - bool inInterval = memberDistance >= _nextInterval->minDistance - && (_nextInterval->inclusiveMax ? - memberDistance <= _nextInterval->maxDistance : - memberDistance < _nextInterval->maxDistance); - - // Update found distance stats - if (_nextIntervalStats->minDistanceFound < 0 - || memberDistance < _nextIntervalStats->minDistanceFound) { - _nextIntervalStats->minDistanceFound = memberDistance; - } + if (inInterval) { + _resultBuffer.push(SearchResult(nextMemberID, memberDistance)); - if (_nextIntervalStats->maxDistanceFound < 0 - || memberDistance > _nextIntervalStats->maxDistanceFound) { - _nextIntervalStats->maxDistanceFound = memberDistance; + // Store the member's RecordId, if available, for quick invalidation + if (nextMember->hasLoc()) { + _nextIntervalSeen.insert(std::make_pair(nextMember->loc, nextMemberID)); } - if (inInterval) { - _resultBuffer.push(SearchResult(nextMemberID, memberDistance)); - - // Store the member's RecordId, if available, for quick invalidation - if (nextMember->hasLoc()) { - _nextIntervalSeen.insert(std::make_pair(nextMember->loc, nextMemberID)); - } + ++_nextIntervalStats->numResultsBuffered; - ++_nextIntervalStats->numResultsBuffered; - - // Update buffered distance stats - 
if (_nextIntervalStats->minDistanceBuffered < 0 - || memberDistance < _nextIntervalStats->minDistanceBuffered) { - _nextIntervalStats->minDistanceBuffered = memberDistance; - } - - if (_nextIntervalStats->maxDistanceBuffered < 0 - || memberDistance > _nextIntervalStats->maxDistanceBuffered) { - _nextIntervalStats->maxDistanceBuffered = memberDistance; - } - } - else { - _workingSet->free(nextMemberID); + // Update buffered distance stats + if (_nextIntervalStats->minDistanceBuffered < 0 || + memberDistance < _nextIntervalStats->minDistanceBuffered) { + _nextIntervalStats->minDistanceBuffered = memberDistance; } - return PlanStage::NEED_TIME; + if (_nextIntervalStats->maxDistanceBuffered < 0 || + memberDistance > _nextIntervalStats->maxDistanceBuffered) { + _nextIntervalStats->maxDistanceBuffered = memberDistance; + } + } else { + _workingSet->free(nextMemberID); } - PlanStage::StageState NearStage::advanceNext(WorkingSetID* toReturn) { - - if (_resultBuffer.empty()) { - - // We're done returning the documents buffered for this annulus, so we can - // clear out our buffered RecordIds. - _nextIntervalSeen.clear(); - _searchState = SearchState_Buffering; - return PlanStage::NEED_TIME; - } + return PlanStage::NEED_TIME; +} - *toReturn = _resultBuffer.top().resultID; - _resultBuffer.pop(); +PlanStage::StageState NearStage::advanceNext(WorkingSetID* toReturn) { + if (_resultBuffer.empty()) { + // We're done returning the documents buffered for this annulus, so we can + // clear out our buffered RecordIds. + _nextIntervalSeen.clear(); + _searchState = SearchState_Buffering; + return PlanStage::NEED_TIME; + } - // If we're returning something, take it out of our RecordId -> WSID map so that future - // calls to invalidate don't cause us to take action for a RecordId we're done with. 
- WorkingSetMember* member = _workingSet->get(*toReturn); - if (member->hasLoc()) { - _nextIntervalSeen.erase(member->loc); - } + *toReturn = _resultBuffer.top().resultID; + _resultBuffer.pop(); - return PlanStage::ADVANCED; + // If we're returning something, take it out of our RecordId -> WSID map so that future + // calls to invalidate don't cause us to take action for a RecordId we're done with. + WorkingSetMember* member = _workingSet->get(*toReturn); + if (member->hasLoc()) { + _nextIntervalSeen.erase(member->loc); } - bool NearStage::isEOF() { - return SearchState_Finished == _searchState; - } + return PlanStage::ADVANCED; +} - void NearStage::saveState() { - _txn = NULL; - ++_stats->common.yields; - for (size_t i = 0; i < _childrenIntervals.size(); i++) { - _childrenIntervals[i]->covering->saveState(); - } +bool NearStage::isEOF() { + return SearchState_Finished == _searchState; +} - // Subclass specific saving, e.g. saving the 2d or 2dsphere density estimator. - finishSaveState(); +void NearStage::saveState() { + _txn = NULL; + ++_stats->common.yields; + for (size_t i = 0; i < _childrenIntervals.size(); i++) { + _childrenIntervals[i]->covering->saveState(); } - void NearStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_stats->common.unyields; - for (size_t i = 0; i < _childrenIntervals.size(); i++) { - _childrenIntervals[i]->covering->restoreState(opCtx); - } + // Subclass specific saving, e.g. saving the 2d or 2dsphere density estimator. + finishSaveState(); +} - // Subclass specific restoring, e.g. restoring the 2d or 2dsphere density estimator. 
- finishRestoreState(opCtx); +void NearStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_stats->common.unyields; + for (size_t i = 0; i < _childrenIntervals.size(); i++) { + _childrenIntervals[i]->covering->restoreState(opCtx); } - void NearStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_stats->common.invalidates; - for (size_t i = 0; i < _childrenIntervals.size(); i++) { - _childrenIntervals[i]->covering->invalidate(txn, dl, type); - } - - // If a result is in _resultBuffer and has a RecordId it will be in _nextIntervalSeen as - // well. It's safe to return the result w/o the RecordId, so just fetch the result. - unordered_map<RecordId, WorkingSetID, RecordId::Hasher>::iterator seenIt = _nextIntervalSeen - .find(dl); + // Subclass specific restoring, e.g. restoring the 2d or 2dsphere density estimator. + finishRestoreState(opCtx); +} - if (seenIt != _nextIntervalSeen.end()) { +void NearStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_stats->common.invalidates; + for (size_t i = 0; i < _childrenIntervals.size(); i++) { + _childrenIntervals[i]->covering->invalidate(txn, dl, type); + } - WorkingSetMember* member = _workingSet->get(seenIt->second); - verify(member->hasLoc()); - WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); - verify(!member->hasLoc()); + // If a result is in _resultBuffer and has a RecordId it will be in _nextIntervalSeen as + // well. It's safe to return the result w/o the RecordId, so just fetch the result. 
+ unordered_map<RecordId, WorkingSetID, RecordId::Hasher>::iterator seenIt = + _nextIntervalSeen.find(dl); - // Don't keep it around in the seen map since there's no valid RecordId anymore - _nextIntervalSeen.erase(seenIt); - } + if (seenIt != _nextIntervalSeen.end()) { + WorkingSetMember* member = _workingSet->get(seenIt->second); + verify(member->hasLoc()); + WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); + verify(!member->hasLoc()); - // Subclass specific invalidation, e.g. passing the invalidation to the 2d or 2dsphere - // density estimator. - finishInvalidate(txn, dl, type); + // Don't keep it around in the seen map since there's no valid RecordId anymore + _nextIntervalSeen.erase(seenIt); } - vector<PlanStage*> NearStage::getChildren() const { - vector<PlanStage*> children; - for (size_t i = 0; i < _childrenIntervals.size(); i++) { - children.push_back(_childrenIntervals[i]->covering.get()); - } - return children; - } + // Subclass specific invalidation, e.g. passing the invalidation to the 2d or 2dsphere + // density estimator. 
+ finishInvalidate(txn, dl, type); +} - PlanStageStats* NearStage::getStats() { - PlanStageStats* statsClone = _stats->clone(); - for (size_t i = 0; i < _childrenIntervals.size(); ++i) { - statsClone->children.push_back(_childrenIntervals[i]->covering->getStats()); - } - return statsClone; +vector<PlanStage*> NearStage::getChildren() const { + vector<PlanStage*> children; + for (size_t i = 0; i < _childrenIntervals.size(); i++) { + children.push_back(_childrenIntervals[i]->covering.get()); } + return children; +} - StageType NearStage::stageType() const { - return _stats->stageType; +PlanStageStats* NearStage::getStats() { + PlanStageStats* statsClone = _stats->clone(); + for (size_t i = 0; i < _childrenIntervals.size(); ++i) { + statsClone->children.push_back(_childrenIntervals[i]->covering->getStats()); } + return statsClone; +} - const CommonStats* NearStage::getCommonStats() const { - return &_stats->common; - } +StageType NearStage::stageType() const { + return _stats->stageType; +} - const SpecificStats* NearStage::getSpecificStats() const { - return _stats->specific.get(); - } +const CommonStats* NearStage::getCommonStats() const { + return &_stats->common; +} - NearStats* NearStage::getNearStats() { - return static_cast<NearStats*>(_stats->specific.get()); - } +const SpecificStats* NearStage::getSpecificStats() const { + return _stats->specific.get(); +} + +NearStats* NearStage::getNearStats() { + return static_cast<NearStats*>(_stats->specific.get()); +} -} // namespace mongo +} // namespace mongo diff --git a/src/mongo/db/exec/near.h b/src/mongo/db/exec/near.h index f21758617ac..6468d5fcadb 100644 --- a/src/mongo/db/exec/near.h +++ b/src/mongo/db/exec/near.h @@ -42,194 +42,190 @@ namespace mongo { +/** + * An abstract stage which uses a progressive sort to return results sorted by distance. This + * is useful when we do not have a full ordering computed over the distance metric and don't + * want to generate one. 
+ * + * Child stages need to implement functionality which: + * + * - defines a distance metric + * - iterates through ordered distance intervals, nearest to furthest + * - provides a covering for each distance interval + * + * For example - given a distance search over documents with distances from [0 -> 10], the child + * stage might break up the search into intervals [0->5),[5,7),[7->10]. + * + * Each interval requires a PlanStage which *covers* the interval (returns all results in the + * interval). Results in each interval are buffered fully before being returned to ensure that + * ordering is preserved. + * + * For efficient search, the child stage which covers the distance interval in question should + * not return too many results outside the interval, but correctness only depends on the child + * stage returning all results inside the interval. As an example, a PlanStage which covers the + * interval [0->5) might just be a full collection scan - this will always cover every interval, + * but is slow. If there is an index available, an IndexScan stage might also return all + * documents with distance [0->5) but would be much faster. + * + * Also for efficient search, the intervals should not be too large or too small - though again + * correctness does not depend on interval size. + * + * TODO: Right now the interface allows the nextCovering() to be adaptive, but doesn't allow + * aborting and shrinking a covered range being buffered if we guess wrong. 
+ */ +class NearStage : public PlanStage { +public: + struct CoveredInterval; + + virtual ~NearStage(); + + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); + + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + + virtual std::vector<PlanStage*> getChildren() const; + + virtual StageType stageType() const; + virtual PlanStageStats* getStats(); + virtual const CommonStats* getCommonStats() const; + virtual const SpecificStats* getSpecificStats() const; + +protected: /** - * An abstract stage which uses a progressive sort to return results sorted by distance. This - * is useful when we do not have a full ordering computed over the distance metric and don't - * want to generate one. - * - * Child stages need to implement functionality which: - * - * - defines a distance metric - * - iterates through ordered distance intervals, nearest to furthest - * - provides a covering for each distance interval - * - * For example - given a distance search over documents with distances from [0 -> 10], the child - * stage might break up the search into intervals [0->5),[5,7),[7->10]. - * - * Each interval requires a PlanStage which *covers* the interval (returns all results in the - * interval). Results in each interval are buffered fully before being returned to ensure that - * ordering is preserved. - * - * For efficient search, the child stage which covers the distance interval in question should - * not return too many results outside the interval, but correctness only depends on the child - * stage returning all results inside the interval. As an example, a PlanStage which covers the - * interval [0->5) might just be a full collection scan - this will always cover every interval, - * but is slow. If there is an index available, an IndexScan stage might also return all - * documents with distance [0->5) but would be much faster. 
- * - * Also for efficient search, the intervals should not be too large or too small - though again - * correctness does not depend on interval size. + * Subclasses of NearStage must provide basics + a stats object which gets owned here. + * The stats object must have specific stats which are a subclass of NearStats, otherwise + * it's generated automatically. + */ + NearStage(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + PlanStageStats* stats); + + /** + * Exposes NearStats for adaptive search, allows additional specific stats in subclasses. + */ + NearStats* getNearStats(); + + // + // Methods implemented for specific search functionality + // + + /** + * Constructs the next covering over the next interval to buffer results from, or NULL + * if the full range has been searched. Use the provided working set as the working + * set for the covering stage if required. * - * TODO: Right now the interface allows the nextCovering() to be adaptive, but doesn't allow - * aborting and shrinking a covered range being buffered if we guess wrong. + * Returns !OK on failure to create next stage. */ - class NearStage : public PlanStage { - public: - - struct CoveredInterval; - - virtual ~NearStage(); - - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); - - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - - virtual std::vector<PlanStage*> getChildren() const; - - virtual StageType stageType() const; - virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; - - protected: - - /** - * Subclasses of NearStage must provide basics + a stats object which gets owned here. - * The stats object must have specific stats which are a subclass of NearStats, otherwise - * it's generated automatically. 
- */ - NearStage(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - PlanStageStats* stats); - - /** - * Exposes NearStats for adaptive search, allows additional specific stats in subclasses. - */ - NearStats* getNearStats(); - - // - // Methods implemented for specific search functionality - // - - /** - * Constructs the next covering over the next interval to buffer results from, or NULL - * if the full range has been searched. Use the provided working set as the working - * set for the covering stage if required. - * - * Returns !OK on failure to create next stage. - */ - virtual StatusWith<CoveredInterval*> nextInterval(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection) = 0; - - /** - * Computes the distance value for the given member data, or -1 if the member should not be - * returned in the sorted results. - * - * Returns !OK on invalid member data. - */ - virtual StatusWith<double> computeDistance(WorkingSetMember* member) = 0; - - /* - * Initialize near stage before buffering the data. - * Return IS_EOF if subclass finishes the initialization. - * Return NEED_TIME if we need more time. - * Return errors if an error occurs. - * Can't return ADVANCED. - */ - virtual StageState initialize(OperationContext* txn, - WorkingSet* workingSet, - Collection* collection, - WorkingSetID* out) = 0; - - private: - - // - // Save/restore/invalidate work specific to the search type. 
- // - - virtual void finishSaveState() = 0; - - virtual void finishRestoreState(OperationContext* txn) = 0; - - virtual void finishInvalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) = 0; - - // - // Generic methods for progressive search functionality - // - - StageState initNext(WorkingSetID* out); - StageState bufferNext(WorkingSetID* toReturn, Status* error); - StageState advanceNext(WorkingSetID* toReturn); - - // - // Generic state for progressive near search - // - - // Not owned here - OperationContext* _txn; - // Not owned here - WorkingSet* const _workingSet; - // Not owned here, used for fetching buffered results before invalidation - Collection* const _collection; - - // A progressive search works in stages of buffering and then advancing - enum SearchState { - SearchState_Initializing, - SearchState_Buffering, - SearchState_Advancing, - SearchState_Finished - } _searchState; - - // May need to track disklocs from the child stage to do our own deduping, also to do - // invalidation of buffered results. - unordered_map<RecordId, WorkingSetID, RecordId::Hasher> _nextIntervalSeen; - - // Stats for the stage covering this interval - std::unique_ptr<IntervalStats> _nextIntervalStats; - - // Sorted buffered results to be returned - the current interval - struct SearchResult; - std::priority_queue<SearchResult> _resultBuffer; - - // Stats - std::unique_ptr<PlanStageStats> _stats; - - // The current stage from which this stage should buffer results - // Pointer to the last interval in _childrenIntervals. Owned by _childrenIntervals. - CoveredInterval* _nextInterval; - - // All children CoveredIntervals and the sub-stages owned by them. - // - // All children intervals except the last active one are only used by getStats(), - // because they are all EOF. 
- OwnedPointerVector<CoveredInterval> _childrenIntervals; - }; + virtual StatusWith<CoveredInterval*> nextInterval(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection) = 0; /** - * A covered interval over which a portion of a near search can be run. + * Computes the distance value for the given member data, or -1 if the member should not be + * returned in the sorted results. + * + * Returns !OK on invalid member data. */ - struct NearStage::CoveredInterval { - - CoveredInterval(PlanStage* covering, - bool dedupCovering, - double minDistance, - double maxDistance, - bool inclusiveMax); - - // Owned by NearStage - std::unique_ptr<PlanStage> const covering; - const bool dedupCovering; - - const double minDistance; - const double maxDistance; - const bool inclusiveMax; - }; + virtual StatusWith<double> computeDistance(WorkingSetMember* member) = 0; + + /* + * Initialize near stage before buffering the data. + * Return IS_EOF if subclass finishes the initialization. + * Return NEED_TIME if we need more time. + * Return errors if an error occurs. + * Can't return ADVANCED. + */ + virtual StageState initialize(OperationContext* txn, + WorkingSet* workingSet, + Collection* collection, + WorkingSetID* out) = 0; + +private: + // + // Save/restore/invalidate work specific to the search type. 
+ // + + virtual void finishSaveState() = 0; + + virtual void finishRestoreState(OperationContext* txn) = 0; + + virtual void finishInvalidate(OperationContext* txn, + const RecordId& dl, + InvalidationType type) = 0; + + // + // Generic methods for progressive search functionality + // + + StageState initNext(WorkingSetID* out); + StageState bufferNext(WorkingSetID* toReturn, Status* error); + StageState advanceNext(WorkingSetID* toReturn); + + // + // Generic state for progressive near search + // + + // Not owned here + OperationContext* _txn; + // Not owned here + WorkingSet* const _workingSet; + // Not owned here, used for fetching buffered results before invalidation + Collection* const _collection; + + // A progressive search works in stages of buffering and then advancing + enum SearchState { + SearchState_Initializing, + SearchState_Buffering, + SearchState_Advancing, + SearchState_Finished + } _searchState; + + // May need to track disklocs from the child stage to do our own deduping, also to do + // invalidation of buffered results. + unordered_map<RecordId, WorkingSetID, RecordId::Hasher> _nextIntervalSeen; + + // Stats for the stage covering this interval + std::unique_ptr<IntervalStats> _nextIntervalStats; + + // Sorted buffered results to be returned - the current interval + struct SearchResult; + std::priority_queue<SearchResult> _resultBuffer; + + // Stats + std::unique_ptr<PlanStageStats> _stats; + + // The current stage from which this stage should buffer results + // Pointer to the last interval in _childrenIntervals. Owned by _childrenIntervals. + CoveredInterval* _nextInterval; + + // All children CoveredIntervals and the sub-stages owned by them. + // + // All children intervals except the last active one are only used by getStats(), + // because they are all EOF. + OwnedPointerVector<CoveredInterval> _childrenIntervals; +}; + +/** + * A covered interval over which a portion of a near search can be run. 
+ */ +struct NearStage::CoveredInterval { + CoveredInterval(PlanStage* covering, + bool dedupCovering, + double minDistance, + double maxDistance, + bool inclusiveMax); + + // Owned by NearStage + std::unique_ptr<PlanStage> const covering; + const bool dedupCovering; + + const double minDistance; + const double maxDistance; + const bool inclusiveMax; +}; } // namespace mongo diff --git a/src/mongo/db/exec/oplogstart.cpp b/src/mongo/db/exec/oplogstart.cpp index 92de52db505..d05ddfc2f44 100644 --- a/src/mongo/db/exec/oplogstart.cpp +++ b/src/mongo/db/exec/oplogstart.cpp @@ -35,187 +35,192 @@ namespace mongo { - using std::vector; - - const char* OplogStart::kStageType = "OPLOG_START"; - - // Does not take ownership. - OplogStart::OplogStart(OperationContext* txn, - const Collection* collection, - MatchExpression* filter, - WorkingSet* ws) - : _txn(txn), - _needInit(true), - _backwardsScanning(false), - _extentHopping(false), - _done(false), - _collection(collection), - _workingSet(ws), - _filter(filter) { } - - OplogStart::~OplogStart() { } - - PlanStage::StageState OplogStart::work(WorkingSetID* out) { - // We do our (heavy) init in a work(), where work is expected. - if (_needInit) { - CollectionScanParams params; - params.collection = _collection; - params.direction = CollectionScanParams::BACKWARD; - _cs.reset(new CollectionScan(_txn, params, _workingSet, NULL)); - - _needInit = false; - _backwardsScanning = true; - _timer.reset(); - } - - // If we're still reading backwards, keep trying until timing out. - if (_backwardsScanning) { - verify(!_extentHopping); - // Still have time to succeed with reading backwards. - if (_timer.seconds() < _backwardsScanTime) { - return workBackwardsScan(out); - } - - try { - // If this throws WCE, it leave us in a state were the next call to work will retry. 
- switchToExtentHopping(); - } - catch (const WriteConflictException& wce) { - _subIterators.clear(); - *out = WorkingSet::INVALID_ID; - return NEED_YIELD; - } - } - - // Don't find it in time? Swing from extent to extent like tarzan.com. - verify(_extentHopping); - return workExtentHopping(out); +using std::vector; + +const char* OplogStart::kStageType = "OPLOG_START"; + +// Does not take ownership. +OplogStart::OplogStart(OperationContext* txn, + const Collection* collection, + MatchExpression* filter, + WorkingSet* ws) + : _txn(txn), + _needInit(true), + _backwardsScanning(false), + _extentHopping(false), + _done(false), + _collection(collection), + _workingSet(ws), + _filter(filter) {} + +OplogStart::~OplogStart() {} + +PlanStage::StageState OplogStart::work(WorkingSetID* out) { + // We do our (heavy) init in a work(), where work is expected. + if (_needInit) { + CollectionScanParams params; + params.collection = _collection; + params.direction = CollectionScanParams::BACKWARD; + _cs.reset(new CollectionScan(_txn, params, _workingSet, NULL)); + + _needInit = false; + _backwardsScanning = true; + _timer.reset(); } - PlanStage::StageState OplogStart::workExtentHopping(WorkingSetID* out) { - if (_done || _subIterators.empty()) { - return PlanStage::IS_EOF; + // If we're still reading backwards, keep trying until timing out. + if (_backwardsScanning) { + verify(!_extentHopping); + // Still have time to succeed with reading backwards. + if (_timer.seconds() < _backwardsScanTime) { + return workBackwardsScan(out); } - // we work from the back to the front since the back has the newest data. try { - // TODO: should we ever check fetcherForNext()? 
- if (auto record = _subIterators.back()->next()) { - BSONObj obj = record->data.releaseToBson(); - if (!_filter->matchesBSON(obj)) { - _done = true; - WorkingSetID id = _workingSet->allocate(); - WorkingSetMember* member = _workingSet->get(id); - member->loc = record->id; - member->obj = {_txn->recoveryUnit()->getSnapshotId(), std::move(obj)}; - member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; - *out = id; - return PlanStage::ADVANCED; - } - } - } - catch (const WriteConflictException& wce) { + // If this throws WCE, it leave us in a state were the next call to work will retry. + switchToExtentHopping(); + } catch (const WriteConflictException& wce) { + _subIterators.clear(); *out = WorkingSet::INVALID_ID; - return PlanStage::NEED_YIELD; + return NEED_YIELD; } - - _subIterators.pop_back(); - return PlanStage::NEED_TIME; } - void OplogStart::switchToExtentHopping() { - // Set up our extent hopping state. - _subIterators = _collection->getManyCursors(_txn); + // Don't find it in time? Swing from extent to extent like tarzan.com. + verify(_extentHopping); + return workExtentHopping(out); +} - // Transition from backwards scanning to extent hopping. - _backwardsScanning = false; - _extentHopping = true; +PlanStage::StageState OplogStart::workExtentHopping(WorkingSetID* out) { + if (_done || _subIterators.empty()) { + return PlanStage::IS_EOF; + } - // Toss the collection scan we were using. - _cs.reset(); + // we work from the back to the front since the back has the newest data. + try { + // TODO: should we ever check fetcherForNext()? 
+ if (auto record = _subIterators.back()->next()) { + BSONObj obj = record->data.releaseToBson(); + if (!_filter->matchesBSON(obj)) { + _done = true; + WorkingSetID id = _workingSet->allocate(); + WorkingSetMember* member = _workingSet->get(id); + member->loc = record->id; + member->obj = {_txn->recoveryUnit()->getSnapshotId(), std::move(obj)}; + member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; + *out = id; + return PlanStage::ADVANCED; + } + } + } catch (const WriteConflictException& wce) { + *out = WorkingSet::INVALID_ID; + return PlanStage::NEED_YIELD; } - PlanStage::StageState OplogStart::workBackwardsScan(WorkingSetID* out) { - PlanStage::StageState state = _cs->work(out); + _subIterators.pop_back(); + return PlanStage::NEED_TIME; +} - // EOF. Just start from the beginning, which is where we've hit. - if (PlanStage::IS_EOF == state) { - _done = true; - return state; - } +void OplogStart::switchToExtentHopping() { + // Set up our extent hopping state. + _subIterators = _collection->getManyCursors(_txn); - if (PlanStage::ADVANCED != state) { return state; } + // Transition from backwards scanning to extent hopping. + _backwardsScanning = false; + _extentHopping = true; - WorkingSetMember* member = _workingSet->get(*out); - verify(member->hasObj()); - verify(member->hasLoc()); + // Toss the collection scan we were using. + _cs.reset(); +} - if (!_filter->matchesBSON(member->obj.value())) { - _done = true; - // RecordId is returned in *out. - return PlanStage::ADVANCED; - } - else { - _workingSet->free(*out); - return PlanStage::NEED_TIME; - } +PlanStage::StageState OplogStart::workBackwardsScan(WorkingSetID* out) { + PlanStage::StageState state = _cs->work(out); + + // EOF. Just start from the beginning, which is where we've hit. 
+ if (PlanStage::IS_EOF == state) { + _done = true; + return state; } - bool OplogStart::isEOF() { return _done; } + if (PlanStage::ADVANCED != state) { + return state; + } - void OplogStart::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - if (_needInit) { return; } + WorkingSetMember* member = _workingSet->get(*out); + verify(member->hasObj()); + verify(member->hasLoc()); - if (INVALIDATION_DELETION != type) { return; } + if (!_filter->matchesBSON(member->obj.value())) { + _done = true; + // RecordId is returned in *out. + return PlanStage::ADVANCED; + } else { + _workingSet->free(*out); + return PlanStage::NEED_TIME; + } +} - if (_cs) { - _cs->invalidate(txn, dl, type); - } +bool OplogStart::isEOF() { + return _done; +} - for (size_t i = 0; i < _subIterators.size(); i++) { - _subIterators[i]->invalidate(dl); - } +void OplogStart::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + if (_needInit) { + return; } - void OplogStart::saveState() { - _txn = NULL; - if (_cs) { - _cs->saveState(); - } + if (INVALIDATION_DELETION != type) { + return; + } - for (size_t i = 0; i < _subIterators.size(); i++) { - _subIterators[i]->savePositioned(); - } + if (_cs) { + _cs->invalidate(txn, dl, type); } - void OplogStart::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - if (_cs) { - _cs->restoreState(opCtx); - } + for (size_t i = 0; i < _subIterators.size(); i++) { + _subIterators[i]->invalidate(dl); + } +} - for (size_t i = 0; i < _subIterators.size(); i++) { - if (!_subIterators[i]->restore(opCtx)) { - _subIterators.erase(_subIterators.begin() + i); - // need to hit same i on next pass through loop - i--; - } - } +void OplogStart::saveState() { + _txn = NULL; + if (_cs) { + _cs->saveState(); } - PlanStageStats* OplogStart::getStats() { - std::unique_ptr<PlanStageStats> ret(new PlanStageStats(CommonStats(kStageType), - STAGE_OPLOG_START)); - ret->specific.reset(new 
CollectionScanStats()); - return ret.release(); + for (size_t i = 0; i < _subIterators.size(); i++) { + _subIterators[i]->savePositioned(); } +} - vector<PlanStage*> OplogStart::getChildren() const { - vector<PlanStage*> empty; - return empty; +void OplogStart::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + if (_cs) { + _cs->restoreState(opCtx); } - int OplogStart::_backwardsScanTime = 5; + for (size_t i = 0; i < _subIterators.size(); i++) { + if (!_subIterators[i]->restore(opCtx)) { + _subIterators.erase(_subIterators.begin() + i); + // need to hit same i on next pass through loop + i--; + } + } +} + +PlanStageStats* OplogStart::getStats() { + std::unique_ptr<PlanStageStats> ret( + new PlanStageStats(CommonStats(kStageType), STAGE_OPLOG_START)); + ret->specific.reset(new CollectionScanStats()); + return ret.release(); +} + +vector<PlanStage*> OplogStart::getChildren() const { + vector<PlanStage*> empty; + return empty; +} + +int OplogStart::_backwardsScanTime = 5; } // namespace mongo diff --git a/src/mongo/db/exec/oplogstart.h b/src/mongo/db/exec/oplogstart.h index d7da98f6833..193233a6215 100644 --- a/src/mongo/db/exec/oplogstart.h +++ b/src/mongo/db/exec/oplogstart.h @@ -38,104 +38,116 @@ namespace mongo { - /** - * OplogStart walks a collection backwards to find the first object in the collection that - * matches the query. It's used by replication to efficiently find where the oplog should be - * replayed from. - * - * The oplog is always a capped collection. In capped collections, documents are oriented on - * disk according to insertion order. The oplog inserts documents with increasing timestamps. - * Queries on the oplog look for entries that are after a certain time. Therefore if we - * navigate backwards, the last document we encounter that satisfies our query (over the - * timestamp) is the first document we must scan from to answer the query. - * - * Why isn't this a normal reverse table scan, you may ask? 
We could be correct if we used a - * normal reverse collection scan. However, that's not fast enough. Since we know all - * documents are oriented on disk in insertion order, we know all documents in one extent were - * inserted before documents in a subsequent extent. As such we can skip through entire extents - * looking only at the first document. - * - * Why is this a stage? Because we want to yield, and we want to be notified of RecordId - * invalidations. :( - */ - class OplogStart : public PlanStage { - public: - // Does not take ownership. - OplogStart(OperationContext* txn, - const Collection* collection, - MatchExpression* filter, - WorkingSet* ws); - virtual ~OplogStart(); +/** + * OplogStart walks a collection backwards to find the first object in the collection that + * matches the query. It's used by replication to efficiently find where the oplog should be + * replayed from. + * + * The oplog is always a capped collection. In capped collections, documents are oriented on + * disk according to insertion order. The oplog inserts documents with increasing timestamps. + * Queries on the oplog look for entries that are after a certain time. Therefore if we + * navigate backwards, the last document we encounter that satisfies our query (over the + * timestamp) is the first document we must scan from to answer the query. + * + * Why isn't this a normal reverse table scan, you may ask? We could be correct if we used a + * normal reverse collection scan. However, that's not fast enough. Since we know all + * documents are oriented on disk in insertion order, we know all documents in one extent were + * inserted before documents in a subsequent extent. As such we can skip through entire extents + * looking only at the first document. + * + * Why is this a stage? Because we want to yield, and we want to be notified of RecordId + * invalidations. :( + */ +class OplogStart : public PlanStage { +public: + // Does not take ownership. 
+ OplogStart(OperationContext* txn, + const Collection* collection, + MatchExpression* filter, + WorkingSet* ws); + virtual ~OplogStart(); - virtual StageState work(WorkingSetID* out); - virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - // Returns empty PlanStageStats object - virtual PlanStageStats* getStats(); + // Returns empty PlanStageStats object + virtual PlanStageStats* getStats(); - // - // Exec stats -- do not call these for the oplog start stage. - // - virtual const CommonStats* getCommonStats() const { return NULL; } + // + // Exec stats -- do not call these for the oplog start stage. + // + virtual const CommonStats* getCommonStats() const { + return NULL; + } - virtual const SpecificStats* getSpecificStats() const { return NULL; } + virtual const SpecificStats* getSpecificStats() const { + return NULL; + } - virtual StageType stageType() const { return STAGE_OPLOG_START; } + virtual StageType stageType() const { + return STAGE_OPLOG_START; + } - // For testing only. - void setBackwardsScanTime(int newTime) { _backwardsScanTime = newTime; } - bool isExtentHopping() { return _extentHopping; } - bool isBackwardsScanning() { return _backwardsScanning; } + // For testing only. 
+ void setBackwardsScanTime(int newTime) { + _backwardsScanTime = newTime; + } + bool isExtentHopping() { + return _extentHopping; + } + bool isBackwardsScanning() { + return _backwardsScanning; + } - static const char* kStageType; + static const char* kStageType; - private: - StageState workBackwardsScan(WorkingSetID* out); +private: + StageState workBackwardsScan(WorkingSetID* out); - void switchToExtentHopping(); + void switchToExtentHopping(); - StageState workExtentHopping(WorkingSetID* out); + StageState workExtentHopping(WorkingSetID* out); - // transactional context for read locks. Not owned by us - OperationContext* _txn; + // transactional context for read locks. Not owned by us + OperationContext* _txn; - // If we're backwards scanning we just punt to a collscan. - std::unique_ptr<CollectionScan> _cs; + // If we're backwards scanning we just punt to a collscan. + std::unique_ptr<CollectionScan> _cs; - // This is only used for the extent hopping scan. - std::vector<std::unique_ptr<RecordCursor>> _subIterators; + // This is only used for the extent hopping scan. + std::vector<std::unique_ptr<RecordCursor>> _subIterators; - // Have we done our heavy init yet? - bool _needInit; + // Have we done our heavy init yet? + bool _needInit; - // Our first state: going backwards via a collscan. - bool _backwardsScanning; + // Our first state: going backwards via a collscan. + bool _backwardsScanning; - // Our second state: hopping backwards extent by extent. - bool _extentHopping; + // Our second state: hopping backwards extent by extent. + bool _extentHopping; - // Our final state: done. - bool _done; + // Our final state: done. + bool _done; - const Collection* _collection; + const Collection* _collection; - // We only go backwards via a collscan for a few seconds. - Timer _timer; + // We only go backwards via a collscan for a few seconds. + Timer _timer; - // WorkingSet is not owned by us. - WorkingSet* _workingSet; + // WorkingSet is not owned by us. 
+ WorkingSet* _workingSet; - std::string _ns; + std::string _ns; - MatchExpression* _filter; + MatchExpression* _filter; - static int _backwardsScanTime; - }; + static int _backwardsScanTime; +}; } // namespace mongo diff --git a/src/mongo/db/exec/or.cpp b/src/mongo/db/exec/or.cpp index a7d370c41fa..2513635db1d 100644 --- a/src/mongo/db/exec/or.cpp +++ b/src/mongo/db/exec/or.cpp @@ -35,171 +35,172 @@ namespace mongo { - using std::unique_ptr; - using std::vector; +using std::unique_ptr; +using std::vector; - // static - const char* OrStage::kStageType = "OR"; +// static +const char* OrStage::kStageType = "OR"; - OrStage::OrStage(WorkingSet* ws, bool dedup, const MatchExpression* filter) - : _ws(ws), _filter(filter), _currentChild(0), _dedup(dedup), _commonStats(kStageType) { } +OrStage::OrStage(WorkingSet* ws, bool dedup, const MatchExpression* filter) + : _ws(ws), _filter(filter), _currentChild(0), _dedup(dedup), _commonStats(kStageType) {} - OrStage::~OrStage() { - for (size_t i = 0; i < _children.size(); ++i) { - delete _children[i]; - } +OrStage::~OrStage() { + for (size_t i = 0; i < _children.size(); ++i) { + delete _children[i]; } +} - void OrStage::addChild(PlanStage* child) { _children.push_back(child); } - - bool OrStage::isEOF() { return _currentChild >= _children.size(); } +void OrStage::addChild(PlanStage* child) { + _children.push_back(child); +} - PlanStage::StageState OrStage::work(WorkingSetID* out) { - ++_commonStats.works; +bool OrStage::isEOF() { + return _currentChild >= _children.size(); +} - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); +PlanStage::StageState OrStage::work(WorkingSetID* out) { + ++_commonStats.works; - if (isEOF()) { return PlanStage::IS_EOF; } + // Adds the amount of time taken by work() to executionTimeMillis. 
+ ScopedTimer timer(&_commonStats.executionTimeMillis); - WorkingSetID id = WorkingSet::INVALID_ID; - StageState childStatus = _children[_currentChild]->work(&id); + if (isEOF()) { + return PlanStage::IS_EOF; + } - if (PlanStage::ADVANCED == childStatus) { - WorkingSetMember* member = _ws->get(id); + WorkingSetID id = WorkingSet::INVALID_ID; + StageState childStatus = _children[_currentChild]->work(&id); - // If we're deduping (and there's something to dedup by) - if (_dedup && member->hasLoc()) { - ++_specificStats.dupsTested; + if (PlanStage::ADVANCED == childStatus) { + WorkingSetMember* member = _ws->get(id); - // ...and we've seen the RecordId before - if (_seen.end() != _seen.find(member->loc)) { - // ...drop it. - ++_specificStats.dupsDropped; - _ws->free(id); - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - else { - // Otherwise, note that we've seen it. - _seen.insert(member->loc); - } - } + // If we're deduping (and there's something to dedup by) + if (_dedup && member->hasLoc()) { + ++_specificStats.dupsTested; - if (Filter::passes(member, _filter)) { - // Match! return it. - *out = id; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - else { - // Does not match, try again. + // ...and we've seen the RecordId before + if (_seen.end() != _seen.find(member->loc)) { + // ...drop it. + ++_specificStats.dupsDropped; _ws->free(id); ++_commonStats.needTime; return PlanStage::NEED_TIME; + } else { + // Otherwise, note that we've seen it. + _seen.insert(member->loc); } } - else if (PlanStage::IS_EOF == childStatus) { - // Done with _currentChild, move to the next one. - ++_currentChild; - // Maybe we're out of children. - if (isEOF()) { - return PlanStage::IS_EOF; - } - else { - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - } - else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) { + if (Filter::passes(member, _filter)) { + // Match! return it. 
*out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. - if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "OR stage failed to read in results from child " << _currentChild; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } - return childStatus; + ++_commonStats.advanced; + return PlanStage::ADVANCED; + } else { + // Does not match, try again. + _ws->free(id); + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } - else if (PlanStage::NEED_TIME == childStatus) { + } else if (PlanStage::IS_EOF == childStatus) { + // Done with _currentChild, move to the next one. + ++_currentChild; + + // Maybe we're out of children. + if (isEOF()) { + return PlanStage::IS_EOF; + } else { ++_commonStats.needTime; + return PlanStage::NEED_TIME; } - else if (PlanStage::NEED_YIELD == childStatus) { - ++_commonStats.needYield; - *out = id; + } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. + if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "OR stage failed to read in results from child " << _currentChild; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); } - - // NEED_TIME, ERROR, NEED_YIELD, pass them up. 
return childStatus; + } else if (PlanStage::NEED_TIME == childStatus) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == childStatus) { + ++_commonStats.needYield; + *out = id; } - void OrStage::saveState() { - ++_commonStats.yields; - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->saveState(); - } - } + // NEED_TIME, ERROR, NEED_YIELD, pass them up. + return childStatus; +} - void OrStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->restoreState(opCtx); - } +void OrStage::saveState() { + ++_commonStats.yields; + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->saveState(); } +} - void OrStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; +void OrStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->restoreState(opCtx); + } +} - if (isEOF()) { return; } +void OrStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->invalidate(txn, dl, type); - } - - // If we see DL again it is not the same record as it once was so we still want to - // return it. - if (_dedup && INVALIDATION_DELETION == type) { - unordered_set<RecordId, RecordId::Hasher>::iterator it = _seen.find(dl); - if (_seen.end() != it) { - ++_specificStats.locsForgotten; - _seen.erase(dl); - } - } + if (isEOF()) { + return; } - vector<PlanStage*> OrStage::getChildren() const { - return _children; + for (size_t i = 0; i < _children.size(); ++i) { + _children[i]->invalidate(txn, dl, type); } - PlanStageStats* OrStage::getStats() { - _commonStats.isEOF = isEOF(); - - // Add a BSON representation of the filter to the stats tree, if there is one. 
- if (NULL != _filter) { - BSONObjBuilder bob; - _filter->toBSON(&bob); - _commonStats.filter = bob.obj(); + // If we see DL again it is not the same record as it once was so we still want to + // return it. + if (_dedup && INVALIDATION_DELETION == type) { + unordered_set<RecordId, RecordId::Hasher>::iterator it = _seen.find(dl); + if (_seen.end() != it) { + ++_specificStats.locsForgotten; + _seen.erase(dl); } + } +} - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_OR)); - ret->specific.reset(new OrStats(_specificStats)); - for (size_t i = 0; i < _children.size(); ++i) { - ret->children.push_back(_children[i]->getStats()); - } +vector<PlanStage*> OrStage::getChildren() const { + return _children; +} - return ret.release(); - } +PlanStageStats* OrStage::getStats() { + _commonStats.isEOF = isEOF(); - const CommonStats* OrStage::getCommonStats() const { - return &_commonStats; + // Add a BSON representation of the filter to the stats tree, if there is one. + if (NULL != _filter) { + BSONObjBuilder bob; + _filter->toBSON(&bob); + _commonStats.filter = bob.obj(); } - const SpecificStats* OrStage::getSpecificStats() const { - return &_specificStats; + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_OR)); + ret->specific.reset(new OrStats(_specificStats)); + for (size_t i = 0; i < _children.size(); ++i) { + ret->children.push_back(_children[i]->getStats()); } + return ret.release(); +} + +const CommonStats* OrStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* OrStage::getSpecificStats() const { + return &_specificStats; +} + } // namespace mongo diff --git a/src/mongo/db/exec/or.h b/src/mongo/db/exec/or.h index 3ea2d55a466..6e9250db9bd 100644 --- a/src/mongo/db/exec/or.h +++ b/src/mongo/db/exec/or.h @@ -36,62 +36,64 @@ namespace mongo { - /** - * This stage outputs the union of its children. It optionally deduplicates on RecordId. - * - * Preconditions: Valid RecordId. 
- * - * If we're deduping, we may fail to dedup any invalidated RecordId properly. - */ - class OrStage : public PlanStage { - public: - OrStage(WorkingSet* ws, bool dedup, const MatchExpression* filter); - virtual ~OrStage(); +/** + * This stage outputs the union of its children. It optionally deduplicates on RecordId. + * + * Preconditions: Valid RecordId. + * + * If we're deduping, we may fail to dedup any invalidated RecordId properly. + */ +class OrStage : public PlanStage { +public: + OrStage(WorkingSet* ws, bool dedup, const MatchExpression* filter); + virtual ~OrStage(); - void addChild(PlanStage* child); + void addChild(PlanStage* child); - virtual bool isEOF(); + virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + virtual StageState work(WorkingSetID* out); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_OR; } + virtual StageType stageType() const { + return STAGE_OR; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - // Not owned by us. - WorkingSet* _ws; +private: + // Not owned by us. + WorkingSet* _ws; - // The filter is not owned by us. - const MatchExpression* _filter; + // The filter is not owned by us. 
+ const MatchExpression* _filter; - // Owned by us. - std::vector<PlanStage*> _children; + // Owned by us. + std::vector<PlanStage*> _children; - // Which of _children are we calling work(...) on now? - size_t _currentChild; + // Which of _children are we calling work(...) on now? + size_t _currentChild; - // True if we dedup on RecordId, false otherwise. - bool _dedup; + // True if we dedup on RecordId, false otherwise. + bool _dedup; - // Which RecordIds have we returned? - unordered_set<RecordId, RecordId::Hasher> _seen; + // Which RecordIds have we returned? + unordered_set<RecordId, RecordId::Hasher> _seen; - // Stats - CommonStats _commonStats; - OrStats _specificStats; - }; + // Stats + CommonStats _commonStats; + OrStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/pipeline_proxy.cpp b/src/mongo/db/exec/pipeline_proxy.cpp index d7082cacdbc..9ca0fe788b5 100644 --- a/src/mongo/db/exec/pipeline_proxy.cpp +++ b/src/mongo/db/exec/pipeline_proxy.cpp @@ -36,107 +36,106 @@ namespace mongo { - using boost::intrusive_ptr; - using std::shared_ptr; - using std::vector; - - const char* PipelineProxyStage::kStageType = "PIPELINE_PROXY"; - - PipelineProxyStage::PipelineProxyStage(intrusive_ptr<Pipeline> pipeline, - const std::shared_ptr<PlanExecutor>& child, - WorkingSet* ws) - : _pipeline(pipeline) - , _includeMetaData(_pipeline->getContext()->inShard) // send metadata to merger - , _childExec(child) - , _ws(ws) - {} - - PlanStage::StageState PipelineProxyStage::work(WorkingSetID* out) { - if (!out) { - return PlanStage::FAILURE; - } - - if (!_stash.empty()) { - *out = _ws->allocate(); - WorkingSetMember* member = _ws->get(*out); - member->obj = Snapshotted<BSONObj>(SnapshotId(), _stash.back()); - _stash.pop_back(); - member->state = WorkingSetMember::OWNED_OBJ; - return PlanStage::ADVANCED; - } - - if (boost::optional<BSONObj> next = getNextBson()) { - *out = _ws->allocate(); - WorkingSetMember* member = _ws->get(*out); - member->obj = 
Snapshotted<BSONObj>(SnapshotId(), *next); - member->state = WorkingSetMember::OWNED_OBJ; - return PlanStage::ADVANCED; - } - - return PlanStage::IS_EOF; +using boost::intrusive_ptr; +using std::shared_ptr; +using std::vector; + +const char* PipelineProxyStage::kStageType = "PIPELINE_PROXY"; + +PipelineProxyStage::PipelineProxyStage(intrusive_ptr<Pipeline> pipeline, + const std::shared_ptr<PlanExecutor>& child, + WorkingSet* ws) + : _pipeline(pipeline), + _includeMetaData(_pipeline->getContext()->inShard) // send metadata to merger + , + _childExec(child), + _ws(ws) {} + +PlanStage::StageState PipelineProxyStage::work(WorkingSetID* out) { + if (!out) { + return PlanStage::FAILURE; } - bool PipelineProxyStage::isEOF() { - if (!_stash.empty()) - return false; - - if (boost::optional<BSONObj> next = getNextBson()) { - _stash.push_back(*next); - return false; - } - - return true; + if (!_stash.empty()) { + *out = _ws->allocate(); + WorkingSetMember* member = _ws->get(*out); + member->obj = Snapshotted<BSONObj>(SnapshotId(), _stash.back()); + _stash.pop_back(); + member->state = WorkingSetMember::OWNED_OBJ; + return PlanStage::ADVANCED; } - void PipelineProxyStage::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - // propagate to child executor if still in use - if (std::shared_ptr<PlanExecutor> exec = _childExec.lock()) { - exec->invalidate(txn, dl, type); - } + if (boost::optional<BSONObj> next = getNextBson()) { + *out = _ws->allocate(); + WorkingSetMember* member = _ws->get(*out); + member->obj = Snapshotted<BSONObj>(SnapshotId(), *next); + member->state = WorkingSetMember::OWNED_OBJ; + return PlanStage::ADVANCED; } - void PipelineProxyStage::saveState() { - _pipeline->getContext()->opCtx = NULL; - } + return PlanStage::IS_EOF; +} - void PipelineProxyStage::restoreState(OperationContext* opCtx) { - invariant(_pipeline->getContext()->opCtx == NULL); - _pipeline->getContext()->opCtx = opCtx; - } +bool PipelineProxyStage::isEOF() { + 
if (!_stash.empty()) + return false; - void PipelineProxyStage::pushBack(const BSONObj& obj) { - _stash.push_back(obj); + if (boost::optional<BSONObj> next = getNextBson()) { + _stash.push_back(*next); + return false; } - vector<PlanStage*> PipelineProxyStage::getChildren() const { - vector<PlanStage*> empty; - return empty; - } + return true; +} - PlanStageStats* PipelineProxyStage::getStats() { - std::unique_ptr<PlanStageStats> ret(new PlanStageStats(CommonStats(kStageType), - STAGE_PIPELINE_PROXY)); - ret->specific.reset(new CollectionScanStats()); - return ret.release(); +void PipelineProxyStage::invalidate(OperationContext* txn, + const RecordId& dl, + InvalidationType type) { + // propagate to child executor if still in use + if (std::shared_ptr<PlanExecutor> exec = _childExec.lock()) { + exec->invalidate(txn, dl, type); } - - boost::optional<BSONObj> PipelineProxyStage::getNextBson() { - if (boost::optional<Document> next = _pipeline->output()->getNext()) { - if (_includeMetaData) { - return next->toBsonWithMetaData(); - } - else { - return next->toBson(); - } +} + +void PipelineProxyStage::saveState() { + _pipeline->getContext()->opCtx = NULL; +} + +void PipelineProxyStage::restoreState(OperationContext* opCtx) { + invariant(_pipeline->getContext()->opCtx == NULL); + _pipeline->getContext()->opCtx = opCtx; +} + +void PipelineProxyStage::pushBack(const BSONObj& obj) { + _stash.push_back(obj); +} + +vector<PlanStage*> PipelineProxyStage::getChildren() const { + vector<PlanStage*> empty; + return empty; +} + +PlanStageStats* PipelineProxyStage::getStats() { + std::unique_ptr<PlanStageStats> ret( + new PlanStageStats(CommonStats(kStageType), STAGE_PIPELINE_PROXY)); + ret->specific.reset(new CollectionScanStats()); + return ret.release(); +} + +boost::optional<BSONObj> PipelineProxyStage::getNextBson() { + if (boost::optional<Document> next = _pipeline->output()->getNext()) { + if (_includeMetaData) { + return next->toBsonWithMetaData(); + } else { + return 
next->toBson(); } - - return boost::none; } - shared_ptr<PlanExecutor> PipelineProxyStage::getChildExecutor() { - return _childExec.lock(); - } + return boost::none; +} + +shared_ptr<PlanExecutor> PipelineProxyStage::getChildExecutor() { + return _childExec.lock(); +} -} // namespace mongo +} // namespace mongo diff --git a/src/mongo/db/exec/pipeline_proxy.h b/src/mongo/db/exec/pipeline_proxy.h index 68a33c0170e..ac501b70191 100644 --- a/src/mongo/db/exec/pipeline_proxy.h +++ b/src/mongo/db/exec/pipeline_proxy.h @@ -39,67 +39,73 @@ namespace mongo { - /** - * Stage for pulling results out from an aggregation pipeline. - */ - class PipelineProxyStage : public PlanStage { - public: - PipelineProxyStage(boost::intrusive_ptr<Pipeline> pipeline, - const std::shared_ptr<PlanExecutor>& child, - WorkingSet* ws); +/** + * Stage for pulling results out from an aggregation pipeline. + */ +class PipelineProxyStage : public PlanStage { +public: + PipelineProxyStage(boost::intrusive_ptr<Pipeline> pipeline, + const std::shared_ptr<PlanExecutor>& child, + WorkingSet* ws); - virtual PlanStage::StageState work(WorkingSetID* out); + virtual PlanStage::StageState work(WorkingSetID* out); - virtual bool isEOF(); + virtual bool isEOF(); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - // - // Manage our OperationContext. We intentionally don't propagate to the child - // Runner as that is handled by DocumentSourceCursor as it needs to. - // - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); + // + // Manage our OperationContext. We intentionally don't propagate to the child + // Runner as that is handled by DocumentSourceCursor as it needs to. + // + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); - /** - * Make obj the next object returned by getNext(). 
- */ - void pushBack(const BSONObj& obj); + /** + * Make obj the next object returned by getNext(). + */ + void pushBack(const BSONObj& obj); - /** - * Return a shared pointer to the PlanExecutor that feeds the pipeline. The returned - * pointer may be NULL. - */ - std::shared_ptr<PlanExecutor> getChildExecutor(); + /** + * Return a shared pointer to the PlanExecutor that feeds the pipeline. The returned + * pointer may be NULL. + */ + std::shared_ptr<PlanExecutor> getChildExecutor(); - // Returns empty PlanStageStats object - virtual PlanStageStats* getStats(); + // Returns empty PlanStageStats object + virtual PlanStageStats* getStats(); - // Not used. - virtual CommonStats* getCommonStats() const { return NULL; } + // Not used. + virtual CommonStats* getCommonStats() const { + return NULL; + } - // Not used. - virtual SpecificStats* getSpecificStats() const { return NULL; } + // Not used. + virtual SpecificStats* getSpecificStats() const { + return NULL; + } - // Not used. - virtual std::vector<PlanStage*> getChildren() const; + // Not used. + virtual std::vector<PlanStage*> getChildren() const; - // Not used. - virtual StageType stageType() const { return STAGE_PIPELINE_PROXY; } + // Not used. + virtual StageType stageType() const { + return STAGE_PIPELINE_PROXY; + } - static const char* kStageType; + static const char* kStageType; - private: - boost::optional<BSONObj> getNextBson(); +private: + boost::optional<BSONObj> getNextBson(); - // Things in the _stash sould be returned before pulling items from _pipeline. - const boost::intrusive_ptr<Pipeline> _pipeline; - std::vector<BSONObj> _stash; - const bool _includeMetaData; - std::weak_ptr<PlanExecutor> _childExec; + // Things in the _stash sould be returned before pulling items from _pipeline. + const boost::intrusive_ptr<Pipeline> _pipeline; + std::vector<BSONObj> _stash; + const bool _includeMetaData; + std::weak_ptr<PlanExecutor> _childExec; - // Not owned by us. 
- WorkingSet* _ws; - }; + // Not owned by us. + WorkingSet* _ws; +}; -} // namespace mongo +} // namespace mongo diff --git a/src/mongo/db/exec/plan_stage.h b/src/mongo/db/exec/plan_stage.h index 07536817ca6..a096664b01d 100644 --- a/src/mongo/db/exec/plan_stage.h +++ b/src/mongo/db/exec/plan_stage.h @@ -34,255 +34,247 @@ namespace mongo { - class Collection; - class RecordId; - class OperationContext; +class Collection; +class RecordId; +class OperationContext; + +/** + * A PlanStage ("stage") is the basic building block of a "Query Execution Plan." A stage is + * the smallest piece of machinery used in executing a compiled query. Stages either access + * data (from a collection or an index) to create a stream of results, or transform a stream of + * results (e.g. AND, OR, SORT) to create a stream of results. + * + * Stages have zero or more input streams but only one output stream. Data-accessing stages are + * leaves and data-transforming stages have children. Stages can be connected together to form + * a tree which is then executed (see plan_executor.h) to solve a query. + * + * A stage's input and output are each typed. Only stages with compatible types can be + * connected. + * + * All of the stages of a QEP share a WorkingSet (see working_set.h). Data source stages + * allocate a slot in the WorkingSet, fill the slot with data, and return the ID of that slot. + * Subsequent stages fetch a WorkingSetElement by its ID and operate on the enclosed data. + * + * Stages do nothing unless work() is called. work() is a request to the stage to consume one + * unit of input. Some stages (e.g. AND, SORT) require many calls to work() before generating + * output as they must consume many units of input. These stages will inform the caller that + * they need more time, and work() must be called again in order to produce an output. + * + * Every stage of a query implements the PlanStage interface. 
Queries perform a unit of work + * and report on their subsequent status; see StatusCode for possible states. Query results are + * passed through the WorkingSet interface; see working_set.h for details. + * + * All synchronization is the responsibility of the caller. Queries must be told to yield with + * saveState() if any underlying database state changes. If saveState() is called, + * restoreState() must be called again before any work() is done. + * + * Here is a very simple usage example: + * + * WorkingSet workingSet; + * PlanStage* rootStage = makeQueryPlan(&workingSet, ...); + * while (!rootStage->isEOF()) { + * WorkingSetID result; + * switch(rootStage->work(&result)) { + * case PlanStage::ADVANCED: + * // do something with result + * WorkingSetMember* member = workingSet.get(result); + * cout << "Result: " << member->obj << std::endl; + * break; + * case PlanStage::IS_EOF: + * // All done. Will fall out of while loop. + * break; + * case PlanStage::NEED_TIME: + * // Need more time. + * break; + * case PlanStage::FAILURE: + * // Throw exception or return error + * break; + * } + * + * if (shouldYield) { + * // Occasionally yield. + * stage->saveState(); + * // Do work that requires a yield here (execute other plans, insert, delete, etc.). + * stage->restoreState(); + * } + * } + */ +class PlanStage { +public: + virtual ~PlanStage() {} /** - * A PlanStage ("stage") is the basic building block of a "Query Execution Plan." A stage is - * the smallest piece of machinery used in executing a compiled query. Stages either access - * data (from a collection or an index) to create a stream of results, or transform a stream of - * results (e.g. AND, OR, SORT) to create a stream of results. - * - * Stages have zero or more input streams but only one output stream. Data-accessing stages are - * leaves and data-transforming stages have children. Stages can be connected together to form - * a tree which is then executed (see plan_executor.h) to solve a query. 
- * - * A stage's input and output are each typed. Only stages with compatible types can be - * connected. - * - * All of the stages of a QEP share a WorkingSet (see working_set.h). Data source stages - * allocate a slot in the WorkingSet, fill the slot with data, and return the ID of that slot. - * Subsequent stages fetch a WorkingSetElement by its ID and operate on the enclosed data. - * - * Stages do nothing unless work() is called. work() is a request to the stage to consume one - * unit of input. Some stages (e.g. AND, SORT) require many calls to work() before generating - * output as they must consume many units of input. These stages will inform the caller that - * they need more time, and work() must be called again in order to produce an output. - * - * Every stage of a query implements the PlanStage interface. Queries perform a unit of work - * and report on their subsequent status; see StatusCode for possible states. Query results are - * passed through the WorkingSet interface; see working_set.h for details. - * - * All synchronization is the responsibility of the caller. Queries must be told to yield with - * saveState() if any underlying database state changes. If saveState() is called, - * restoreState() must be called again before any work() is done. - * - * Here is a very simple usage example: - * - * WorkingSet workingSet; - * PlanStage* rootStage = makeQueryPlan(&workingSet, ...); - * while (!rootStage->isEOF()) { - * WorkingSetID result; - * switch(rootStage->work(&result)) { - * case PlanStage::ADVANCED: - * // do something with result - * WorkingSetMember* member = workingSet.get(result); - * cout << "Result: " << member->obj << std::endl; - * break; - * case PlanStage::IS_EOF: - * // All done. Will fall out of while loop. - * break; - * case PlanStage::NEED_TIME: - * // Need more time. - * break; - * case PlanStage::FAILURE: - * // Throw exception or return error - * break; - * } - * - * if (shouldYield) { - * // Occasionally yield. 
- * stage->saveState(); - * // Do work that requires a yield here (execute other plans, insert, delete, etc.). - * stage->restoreState(); - * } - * } + * All possible return values of work(...) */ - class PlanStage { - public: - virtual ~PlanStage() { } + enum StageState { + // work(...) has returned a new result in its out parameter. The caller must free it + // from the working set when done with it. + ADVANCED, - /** - * All possible return values of work(...) - */ - enum StageState { - // work(...) has returned a new result in its out parameter. The caller must free it - // from the working set when done with it. - ADVANCED, + // work(...) won't do anything more. isEOF() will also be true. There is nothing + // output in the out parameter. + IS_EOF, - // work(...) won't do anything more. isEOF() will also be true. There is nothing - // output in the out parameter. - IS_EOF, + // work(...) needs more time to product a result. Call work(...) again. There is + // nothing output in the out parameter. + NEED_TIME, - // work(...) needs more time to product a result. Call work(...) again. There is - // nothing output in the out parameter. - NEED_TIME, - - // The storage engine says we need to yield, possibly to fetch a record from disk, or - // due to an aborted transaction in the storage layer. - // - // Full yield request semantics: - // - // Each stage that receives a NEED_YIELD from a child must propagate the NEED_YIELD up - // and perform no work. - // - // If a yield is requested due to a WriteConflict, the out parameter of work(...) should - // be populated with WorkingSet::INVALID_ID. If it is illegal to yield, a - // WriteConflictException will be thrown. - // - // A yield-requesting stage populates the out parameter of work(...) with a WSID that - // refers to a WSM with a Fetcher*. If it is illegal to yield, this is ignored. This - // difference in behavior can be removed once SERVER-16051 is resolved. 
- // - // The plan executor is responsible for yielding and, if requested, paging in the data - // upon receipt of a NEED_YIELD. The plan executor does NOT free the WSID of the - // requested fetch. The stage that requested the fetch holds the WSID of the loc it - // wants fetched. On the next call to work() that stage can assume a fetch was performed - // on the WSM that the held WSID refers to. - NEED_YIELD, + // The storage engine says we need to yield, possibly to fetch a record from disk, or + // due to an aborted transaction in the storage layer. + // + // Full yield request semantics: + // + // Each stage that receives a NEED_YIELD from a child must propagate the NEED_YIELD up + // and perform no work. + // + // If a yield is requested due to a WriteConflict, the out parameter of work(...) should + // be populated with WorkingSet::INVALID_ID. If it is illegal to yield, a + // WriteConflictException will be thrown. + // + // A yield-requesting stage populates the out parameter of work(...) with a WSID that + // refers to a WSM with a Fetcher*. If it is illegal to yield, this is ignored. This + // difference in behavior can be removed once SERVER-16051 is resolved. + // + // The plan executor is responsible for yielding and, if requested, paging in the data + // upon receipt of a NEED_YIELD. The plan executor does NOT free the WSID of the + // requested fetch. The stage that requested the fetch holds the WSID of the loc it + // wants fetched. On the next call to work() that stage can assume a fetch was performed + // on the WSM that the held WSID refers to. + NEED_YIELD, - // Something went wrong but it's not an internal error. Perhaps our collection was - // dropped or state deleted. - DEAD, + // Something went wrong but it's not an internal error. Perhaps our collection was + // dropped or state deleted. + DEAD, - // Something has gone unrecoverably wrong. Stop running this query. 
- // If the out parameter does not refer to an invalid working set member, - // call WorkingSetCommon::getStatusMemberObject() to get details on the failure. - // Any class implementing this interface must set the WSID out parameter to - // INVALID_ID or a valid WSM ID if FAILURE is returned. - FAILURE, - }; + // Something has gone unrecoverably wrong. Stop running this query. + // If the out parameter does not refer to an invalid working set member, + // call WorkingSetCommon::getStatusMemberObject() to get details on the failure. + // Any class implementing this interface must set the WSID out parameter to + // INVALID_ID or a valid WSM ID if FAILURE is returned. + FAILURE, + }; - static std::string stateStr(const StageState& state) { - if (ADVANCED == state) { - return "ADVANCED"; - } - else if (IS_EOF == state) { - return "IS_EOF"; - } - else if (NEED_TIME == state) { - return "NEED_TIME"; - } - else if (NEED_YIELD == state) { - return "NEED_YIELD"; - } - else if (DEAD == state) { - return "DEAD"; - } - else { - verify(FAILURE == state); - return "FAILURE"; - } + static std::string stateStr(const StageState& state) { + if (ADVANCED == state) { + return "ADVANCED"; + } else if (IS_EOF == state) { + return "IS_EOF"; + } else if (NEED_TIME == state) { + return "NEED_TIME"; + } else if (NEED_YIELD == state) { + return "NEED_YIELD"; + } else if (DEAD == state) { + return "DEAD"; + } else { + verify(FAILURE == state); + return "FAILURE"; } + } - /** - * Perform a unit of work on the query. Ask the stage to produce the next unit of output. - * Stage returns StageState::ADVANCED if *out is set to the next unit of output. Otherwise, - * returns another value of StageState to indicate the stage's status. - */ - virtual StageState work(WorkingSetID* out) = 0; - - /** - * Returns true if no more work can be done on the query / out of results. - */ - virtual bool isEOF() = 0; + /** + * Perform a unit of work on the query. Ask the stage to produce the next unit of output. 
+ * Stage returns StageState::ADVANCED if *out is set to the next unit of output. Otherwise, + * returns another value of StageState to indicate the stage's status. + */ + virtual StageState work(WorkingSetID* out) = 0; - // - // Yielding and isolation semantics: - // - // Any data that is not inserted, deleted, or modified during a yield will be faithfully - // returned by a query that should return that data. - // - // Any data inserted, deleted, or modified during a yield that should be returned by a query - // may or may not be returned by that query. The query could return: nothing; the data - // before; the data after; or both the data before and the data after. - // - // In short, there is no isolation between a query and an insert/delete/update. AKA, - // READ_UNCOMMITTED. - // + /** + * Returns true if no more work can be done on the query / out of results. + */ + virtual bool isEOF() = 0; - /** - * Notifies the stage that all locks are about to be released. The stage must save any - * state required to resume where it was before saveState was called. - * - * Stages must be able to handle multiple calls to saveState() in a row without a call to - * restoreState() in between. - */ - virtual void saveState() = 0; + // + // Yielding and isolation semantics: + // + // Any data that is not inserted, deleted, or modified during a yield will be faithfully + // returned by a query that should return that data. + // + // Any data inserted, deleted, or modified during a yield that should be returned by a query + // may or may not be returned by that query. The query could return: nothing; the data + // before; the data after; or both the data before and the data after. + // + // In short, there is no isolation between a query and an insert/delete/update. AKA, + // READ_UNCOMMITTED. + // - /** - * Notifies the stage that any required locks have been reacquired. The stage must restore - * any saved state and be ready to handle calls to work(). 
- * - * Can only be called after saveState. - * - * If the stage needs an OperationContext during its execution, it may keep a handle to the - * provided OperationContext (which is valid until the next call to saveState()). - */ - virtual void restoreState(OperationContext* opCtx) = 0; + /** + * Notifies the stage that all locks are about to be released. The stage must save any + * state required to resume where it was before saveState was called. + * + * Stages must be able to handle multiple calls to saveState() in a row without a call to + * restoreState() in between. + */ + virtual void saveState() = 0; - /** - * Notifies a stage that a RecordId is going to be deleted (or in-place updated) so that the - * stage can invalidate or modify any state required to continue processing without this - * RecordId. - * - * Can only be called after a saveState but before a restoreState. - * - * The provided OperationContext should be used if any work needs to be performed during the - * invalidate (as the state of the stage must be saved before any calls to invalidate, the - * stage's own OperationContext is inactive during the invalidate and should not be used). - */ - virtual void invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) = 0; + /** + * Notifies the stage that any required locks have been reacquired. The stage must restore + * any saved state and be ready to handle calls to work(). + * + * Can only be called after saveState. + * + * If the stage needs an OperationContext during its execution, it may keep a handle to the + * provided OperationContext (which is valid until the next call to saveState()). + */ + virtual void restoreState(OperationContext* opCtx) = 0; - /** - * Retrieve a list of this stage's children. This stage keeps ownership of - * its children. 
- */ - virtual std::vector<PlanStage*> getChildren() const = 0; + /** + * Notifies a stage that a RecordId is going to be deleted (or in-place updated) so that the + * stage can invalidate or modify any state required to continue processing without this + * RecordId. + * + * Can only be called after a saveState but before a restoreState. + * + * The provided OperationContext should be used if any work needs to be performed during the + * invalidate (as the state of the stage must be saved before any calls to invalidate, the + * stage's own OperationContext is inactive during the invalidate and should not be used). + */ + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) = 0; - /** - * What type of stage is this? - */ - virtual StageType stageType() const = 0; + /** + * Retrieve a list of this stage's children. This stage keeps ownership of + * its children. + */ + virtual std::vector<PlanStage*> getChildren() const = 0; - // - // Execution stats. - // + /** + * What type of stage is this? + */ + virtual StageType stageType() const = 0; - /** - * Returns a tree of stats. See plan_stats.h for the details of this structure. If the - * stage has any children it must propagate the request for stats to them. - * - * Creates plan stats tree which has the same topology as the original execution tree, - * but has a separate lifetime. - * - * Caller owns returned pointer. - */ - virtual PlanStageStats* getStats() = 0; + // + // Execution stats. + // - /** - * Get the CommonStats for this stage. The pointer is *not* owned by the caller. - * - * The returned pointer is only valid when the corresponding stage is also valid. - * It must not exist past the stage. If you need the stats to outlive the stage, - * use the getStats(...) method above. - */ - virtual const CommonStats* getCommonStats() const = 0; + /** + * Returns a tree of stats. See plan_stats.h for the details of this structure. 
If the + * stage has any children it must propagate the request for stats to them. + * + * Creates plan stats tree which has the same topology as the original execution tree, + * but has a separate lifetime. + * + * Caller owns returned pointer. + */ + virtual PlanStageStats* getStats() = 0; - /** - * Get stats specific to this stage. Some stages may not have specific stats, in which - * case they return NULL. The pointer is *not* owned by the caller. - * - * The returned pointer is only valid when the corresponding stage is also valid. - * It must not exist past the stage. If you need the stats to outlive the stage, - * use the getStats(...) method above. - */ - virtual const SpecificStats* getSpecificStats() const = 0; + /** + * Get the CommonStats for this stage. The pointer is *not* owned by the caller. + * + * The returned pointer is only valid when the corresponding stage is also valid. + * It must not exist past the stage. If you need the stats to outlive the stage, + * use the getStats(...) method above. + */ + virtual const CommonStats* getCommonStats() const = 0; - }; + /** + * Get stats specific to this stage. Some stages may not have specific stats, in which + * case they return NULL. The pointer is *not* owned by the caller. + * + * The returned pointer is only valid when the corresponding stage is also valid. + * It must not exist past the stage. If you need the stats to outlive the stage, + * use the getStats(...) method above. 
+ */ + virtual const SpecificStats* getSpecificStats() const = 0; +}; } // namespace mongo diff --git a/src/mongo/db/exec/plan_stats.cpp b/src/mongo/db/exec/plan_stats.cpp index bf079a7113a..0eb3c69b6f2 100644 --- a/src/mongo/db/exec/plan_stats.cpp +++ b/src/mongo/db/exec/plan_stats.cpp @@ -31,19 +31,19 @@ namespace mongo { - void CommonStats::writeExplainTo(BSONObjBuilder* bob) const { - if (NULL == bob) { - return; - } - // potential overflow because original counters are unsigned 64-bit values - bob->append("works", static_cast<long long>(works)); - bob->append("advanced", static_cast<long long>(advanced)); +void CommonStats::writeExplainTo(BSONObjBuilder* bob) const { + if (NULL == bob) { + return; } + // potential overflow because original counters are unsigned 64-bit values + bob->append("works", static_cast<long long>(works)); + bob->append("advanced", static_cast<long long>(advanced)); +} - // forward to CommonStats for now - // TODO: fill in specific stats - void PlanStageStats::writeExplainTo(BSONObjBuilder* bob) const { - common.writeExplainTo(bob); - } +// forward to CommonStats for now +// TODO: fill in specific stats +void PlanStageStats::writeExplainTo(BSONObjBuilder* bob) const { + common.writeExplainTo(bob); +} } // namespace mongo diff --git a/src/mongo/db/exec/plan_stats.h b/src/mongo/db/exec/plan_stats.h index 3504355f071..c3b514260a3 100644 --- a/src/mongo/db/exec/plan_stats.h +++ b/src/mongo/db/exec/plan_stats.h @@ -37,626 +37,614 @@ #include "mongo/db/query/stage_types.h" #include "mongo/platform/cstdint.h" #include "mongo/util/time_support.h" -#include "mongo/util/net/listen.h" // for Listener::getElapsedTimeMillis() +#include "mongo/util/net/listen.h" // for Listener::getElapsedTimeMillis() namespace mongo { +/** + * The interface all specific-to-stage stats provide. + */ +struct SpecificStats { + virtual ~SpecificStats() {} + /** - * The interface all specific-to-stage stats provide. + * Make a deep copy. 
*/ - struct SpecificStats { - virtual ~SpecificStats() { } - - /** - * Make a deep copy. - */ - virtual SpecificStats* clone() const = 0; - }; - - // Every stage has CommonStats. - struct CommonStats { - CommonStats(const char* type) - : stageTypeStr(type), - works(0), - yields(0), - unyields(0), - invalidates(0), - advanced(0), - needTime(0), - needYield(0), - executionTimeMillis(0), - isEOF(false) { } - // String giving the type of the stage. Not owned. - const char* stageTypeStr; - - // Count calls into the stage. - size_t works; - size_t yields; - size_t unyields; - size_t invalidates; - - // How many times was this state the return value of work(...)? - size_t advanced; - size_t needTime; - size_t needYield; - - // BSON representation of a MatchExpression affixed to this node. If there - // is no filter affixed, then 'filter' should be an empty BSONObj. - BSONObj filter; - - // Time elapsed while working inside this stage. - long long executionTimeMillis; - - // TODO: have some way of tracking WSM sizes (or really any series of #s). We can measure - // the size of our inputs and the size of our outputs. We can do a lot with the WS here. - - // TODO: once we've picked a plan, collect different (or additional) stats for display to - // the user, eg. time_t totalTimeSpent; - - // TODO: keep track of the total yield time / fetch time done for a plan. - - bool isEOF; - private: - // Default constructor is illegal. - CommonStats(); - }; - - // The universal container for a stage's stats. - struct PlanStageStats { - PlanStageStats(const CommonStats& c, StageType t) : stageType(t), common(c) { } - - ~PlanStageStats() { - for (size_t i = 0; i < children.size(); ++i) { - delete children[i]; - } + virtual SpecificStats* clone() const = 0; +}; + +// Every stage has CommonStats. 
+struct CommonStats { + CommonStats(const char* type) + : stageTypeStr(type), + works(0), + yields(0), + unyields(0), + invalidates(0), + advanced(0), + needTime(0), + needYield(0), + executionTimeMillis(0), + isEOF(false) {} + // String giving the type of the stage. Not owned. + const char* stageTypeStr; + + // Count calls into the stage. + size_t works; + size_t yields; + size_t unyields; + size_t invalidates; + + // How many times was this state the return value of work(...)? + size_t advanced; + size_t needTime; + size_t needYield; + + // BSON representation of a MatchExpression affixed to this node. If there + // is no filter affixed, then 'filter' should be an empty BSONObj. + BSONObj filter; + + // Time elapsed while working inside this stage. + long long executionTimeMillis; + + // TODO: have some way of tracking WSM sizes (or really any series of #s). We can measure + // the size of our inputs and the size of our outputs. We can do a lot with the WS here. + + // TODO: once we've picked a plan, collect different (or additional) stats for display to + // the user, eg. time_t totalTimeSpent; + + // TODO: keep track of the total yield time / fetch time done for a plan. + + bool isEOF; + +private: + // Default constructor is illegal. + CommonStats(); +}; + +// The universal container for a stage's stats. +struct PlanStageStats { + PlanStageStats(const CommonStats& c, StageType t) : stageType(t), common(c) {} + + ~PlanStageStats() { + for (size_t i = 0; i < children.size(); ++i) { + delete children[i]; } + } - /** - * Make a deep copy. - */ - PlanStageStats* clone() const { - PlanStageStats* stats = new PlanStageStats(common, stageType); - if (specific.get()) { - stats->specific.reset(specific->clone()); - } - for (size_t i = 0; i < children.size(); ++i) { - invariant(children[i]); - stats->children.push_back(children[i]->clone()); - } - return stats; + /** + * Make a deep copy. 
+ */ + PlanStageStats* clone() const { + PlanStageStats* stats = new PlanStageStats(common, stageType); + if (specific.get()) { + stats->specific.reset(specific->clone()); } + for (size_t i = 0; i < children.size(); ++i) { + invariant(children[i]); + stats->children.push_back(children[i]->clone()); + } + return stats; + } - // See query/stage_type.h - StageType stageType; + // See query/stage_type.h + StageType stageType; - // Stats exported by implementing the PlanStage interface. - CommonStats common; + // Stats exported by implementing the PlanStage interface. + CommonStats common; - // Per-stage place to stash additional information - std::unique_ptr<SpecificStats> specific; + // Per-stage place to stash additional information + std::unique_ptr<SpecificStats> specific; - // The stats of the node's children. - std::vector<PlanStageStats*> children; + // The stats of the node's children. + std::vector<PlanStageStats*> children; - private: - MONGO_DISALLOW_COPYING(PlanStageStats); - }; +private: + MONGO_DISALLOW_COPYING(PlanStageStats); +}; - struct AndHashStats : public SpecificStats { - AndHashStats() : flaggedButPassed(0), - flaggedInProgress(0), - memUsage(0), - memLimit(0) { } +struct AndHashStats : public SpecificStats { + AndHashStats() : flaggedButPassed(0), flaggedInProgress(0), memUsage(0), memLimit(0) {} - virtual ~AndHashStats() { } + virtual ~AndHashStats() {} - virtual SpecificStats* clone() const { - AndHashStats* specific = new AndHashStats(*this); - return specific; - } + virtual SpecificStats* clone() const { + AndHashStats* specific = new AndHashStats(*this); + return specific; + } - // Invalidation counters. - // How many results had the AND fully evaluated but were invalidated? - size_t flaggedButPassed; + // Invalidation counters. + // How many results had the AND fully evaluated but were invalidated? + size_t flaggedButPassed; - // How many results were mid-AND but got flagged? 
- size_t flaggedInProgress; + // How many results were mid-AND but got flagged? + size_t flaggedInProgress; - // How many entries are in the map after each child? - // child 'i' produced children[i].common.advanced RecordIds, of which mapAfterChild[i] were - // intersections. - std::vector<size_t> mapAfterChild; + // How many entries are in the map after each child? + // child 'i' produced children[i].common.advanced RecordIds, of which mapAfterChild[i] were + // intersections. + std::vector<size_t> mapAfterChild; - // mapAfterChild[mapAfterChild.size() - 1] WSMswere match tested. - // commonstats.advanced is how many passed. + // mapAfterChild[mapAfterChild.size() - 1] WSMswere match tested. + // commonstats.advanced is how many passed. - // What's our current memory usage? - size_t memUsage; + // What's our current memory usage? + size_t memUsage; - // What's our memory limit? - size_t memLimit; - }; + // What's our memory limit? + size_t memLimit; +}; - struct AndSortedStats : public SpecificStats { - AndSortedStats() : flagged(0) { } +struct AndSortedStats : public SpecificStats { + AndSortedStats() : flagged(0) {} - virtual ~AndSortedStats() { } + virtual ~AndSortedStats() {} - virtual SpecificStats* clone() const { - AndSortedStats* specific = new AndSortedStats(*this); - return specific; - } + virtual SpecificStats* clone() const { + AndSortedStats* specific = new AndSortedStats(*this); + return specific; + } - // How many results from each child did not pass the AND? - std::vector<size_t> failedAnd; + // How many results from each child did not pass the AND? + std::vector<size_t> failedAnd; - // How many results were flagged via invalidation? - size_t flagged; - }; + // How many results were flagged via invalidation? 
+ size_t flagged; +}; - struct CachedPlanStats : public SpecificStats { - CachedPlanStats() { } +struct CachedPlanStats : public SpecificStats { + CachedPlanStats() {} - virtual SpecificStats* clone() const { - return new CachedPlanStats(*this); - } - }; + virtual SpecificStats* clone() const { + return new CachedPlanStats(*this); + } +}; - struct CollectionScanStats : public SpecificStats { - CollectionScanStats() : docsTested(0), direction(1) { } +struct CollectionScanStats : public SpecificStats { + CollectionScanStats() : docsTested(0), direction(1) {} - virtual SpecificStats* clone() const { - CollectionScanStats* specific = new CollectionScanStats(*this); - return specific; - } + virtual SpecificStats* clone() const { + CollectionScanStats* specific = new CollectionScanStats(*this); + return specific; + } - // How many documents did we check against our filter? - size_t docsTested; + // How many documents did we check against our filter? + size_t docsTested; - // >0 if we're traversing the collection forwards. <0 if we're traversing it - // backwards. - int direction; - }; + // >0 if we're traversing the collection forwards. <0 if we're traversing it + // backwards. + int direction; +}; - struct CountStats : public SpecificStats { - CountStats() : nCounted(0), nSkipped(0), trivialCount(false) { } +struct CountStats : public SpecificStats { + CountStats() : nCounted(0), nSkipped(0), trivialCount(false) {} - virtual SpecificStats* clone() const { - CountStats* specific = new CountStats(*this); - return specific; - } + virtual SpecificStats* clone() const { + CountStats* specific = new CountStats(*this); + return specific; + } - // The result of the count. - long long nCounted; + // The result of the count. + long long nCounted; - // The number of results we skipped over. - long long nSkipped; + // The number of results we skipped over. 
+ long long nSkipped; - // A "trivial count" is one that we can answer by calling numRecords() on the - // collection, without actually going through any query logic. - bool trivialCount; - }; + // A "trivial count" is one that we can answer by calling numRecords() on the + // collection, without actually going through any query logic. + bool trivialCount; +}; - struct CountScanStats : public SpecificStats { - CountScanStats() : indexVersion(0), - isMultiKey(false), - isPartial(false), - isSparse(false), - isUnique(false), - keysExamined(0) { } +struct CountScanStats : public SpecificStats { + CountScanStats() + : indexVersion(0), + isMultiKey(false), + isPartial(false), + isSparse(false), + isUnique(false), + keysExamined(0) {} - virtual ~CountScanStats() { } + virtual ~CountScanStats() {} - virtual SpecificStats* clone() const { - CountScanStats* specific = new CountScanStats(*this); - // BSON objects have to be explicitly copied. - specific->keyPattern = keyPattern.getOwned(); - return specific; - } + virtual SpecificStats* clone() const { + CountScanStats* specific = new CountScanStats(*this); + // BSON objects have to be explicitly copied. 
+ specific->keyPattern = keyPattern.getOwned(); + return specific; + } - std::string indexName; + std::string indexName; - BSONObj keyPattern; + BSONObj keyPattern; - int indexVersion; + int indexVersion; - bool isMultiKey; - bool isPartial; - bool isSparse; - bool isUnique; + bool isMultiKey; + bool isPartial; + bool isSparse; + bool isUnique; - size_t keysExamined; + size_t keysExamined; +}; - }; +struct DeleteStats : public SpecificStats { + DeleteStats() : docsDeleted(0), nInvalidateSkips(0) {} - struct DeleteStats : public SpecificStats { - DeleteStats() : docsDeleted(0), nInvalidateSkips(0) { } + virtual SpecificStats* clone() const { + return new DeleteStats(*this); + } - virtual SpecificStats* clone() const { - return new DeleteStats(*this); - } + size_t docsDeleted; - size_t docsDeleted; + // Invalidated documents can be force-fetched, causing the now invalid RecordId to + // be thrown out. The delete stage skips over any results which do not have a RecordId. + size_t nInvalidateSkips; +}; - // Invalidated documents can be force-fetched, causing the now invalid RecordId to - // be thrown out. The delete stage skips over any results which do not have a RecordId. - size_t nInvalidateSkips; - }; +struct DistinctScanStats : public SpecificStats { + DistinctScanStats() : keysExamined(0), indexVersion(0) {} - struct DistinctScanStats : public SpecificStats { - DistinctScanStats() : keysExamined(0), indexVersion(0) { } + virtual SpecificStats* clone() const { + DistinctScanStats* specific = new DistinctScanStats(*this); + specific->keyPattern = keyPattern.getOwned(); + return specific; + } - virtual SpecificStats* clone() const { - DistinctScanStats* specific = new DistinctScanStats(*this); - specific->keyPattern = keyPattern.getOwned(); - return specific; - } + // How many keys did we look at while distinct-ing? + size_t keysExamined; - // How many keys did we look at while distinct-ing? 
- size_t keysExamined; + std::string indexName; - std::string indexName; + BSONObj keyPattern; - BSONObj keyPattern; + int indexVersion; +}; - int indexVersion; - }; +struct FetchStats : public SpecificStats { + FetchStats() : alreadyHasObj(0), forcedFetches(0), docsExamined(0) {} - struct FetchStats : public SpecificStats { - FetchStats() : alreadyHasObj(0), - forcedFetches(0), - docsExamined(0) { } + virtual ~FetchStats() {} - virtual ~FetchStats() { } + virtual SpecificStats* clone() const { + FetchStats* specific = new FetchStats(*this); + return specific; + } - virtual SpecificStats* clone() const { - FetchStats* specific = new FetchStats(*this); - return specific; - } + // Have we seen anything that already had an object? + size_t alreadyHasObj; - // Have we seen anything that already had an object? - size_t alreadyHasObj; + // How many records were we forced to fetch as the result of an invalidation? + size_t forcedFetches; - // How many records were we forced to fetch as the result of an invalidation? - size_t forcedFetches; + // The total number of full documents touched by the fetch stage. + size_t docsExamined; +}; - // The total number of full documents touched by the fetch stage. - size_t docsExamined; - }; +struct GroupStats : public SpecificStats { + GroupStats() : nGroups(0) {} - struct GroupStats : public SpecificStats { - GroupStats() : nGroups(0) { } + virtual ~GroupStats() {} - virtual ~GroupStats() { } + virtual SpecificStats* clone() const { + GroupStats* specific = new GroupStats(*this); + return specific; + } - virtual SpecificStats* clone() const { - GroupStats* specific = new GroupStats(*this); - return specific; - } + // The total number of groups. + size_t nGroups; +}; - // The total number of groups. 
- size_t nGroups; - }; +struct IDHackStats : public SpecificStats { + IDHackStats() : keysExamined(0), docsExamined(0) {} - struct IDHackStats : public SpecificStats { - IDHackStats() : keysExamined(0), - docsExamined(0) { } + virtual ~IDHackStats() {} - virtual ~IDHackStats() { } + virtual SpecificStats* clone() const { + IDHackStats* specific = new IDHackStats(*this); + return specific; + } - virtual SpecificStats* clone() const { - IDHackStats* specific = new IDHackStats(*this); - return specific; - } + // Number of entries retrieved from the index while executing the idhack. + size_t keysExamined; - // Number of entries retrieved from the index while executing the idhack. - size_t keysExamined; - - // Number of documents retrieved from the collection while executing the idhack. - size_t docsExamined; - - }; - - struct IndexScanStats : public SpecificStats { - IndexScanStats() : indexVersion(0), - direction(1), - isMultiKey(false), - isPartial(false), - isSparse(false), - isUnique(false), - dupsTested(0), - dupsDropped(0), - seenInvalidated(0), - keysExamined(0) { } - - virtual ~IndexScanStats() { } - - virtual SpecificStats* clone() const { - IndexScanStats* specific = new IndexScanStats(*this); - // BSON objects have to be explicitly copied. - specific->keyPattern = keyPattern.getOwned(); - specific->indexBounds = indexBounds.getOwned(); - return specific; - } + // Number of documents retrieved from the collection while executing the idhack. + size_t docsExamined; +}; - // Index type being used. 
- std::string indexType; +struct IndexScanStats : public SpecificStats { + IndexScanStats() + : indexVersion(0), + direction(1), + isMultiKey(false), + isPartial(false), + isSparse(false), + isUnique(false), + dupsTested(0), + dupsDropped(0), + seenInvalidated(0), + keysExamined(0) {} - // name of the index being used - std::string indexName; + virtual ~IndexScanStats() {} - BSONObj keyPattern; + virtual SpecificStats* clone() const { + IndexScanStats* specific = new IndexScanStats(*this); + // BSON objects have to be explicitly copied. + specific->keyPattern = keyPattern.getOwned(); + specific->indexBounds = indexBounds.getOwned(); + return specific; + } - int indexVersion; + // Index type being used. + std::string indexType; - // A BSON (opaque, ie. hands off other than toString() it) representation of the bounds - // used. - BSONObj indexBounds; + // name of the index being used + std::string indexName; - // >1 if we're traversing the index along with its order. <1 if we're traversing it - // against the order. - int direction; + BSONObj keyPattern; - // index properties - // Whether this index is over a field that contain array values. - bool isMultiKey; - bool isPartial; - bool isSparse; - bool isUnique; + int indexVersion; - size_t dupsTested; - size_t dupsDropped; + // A BSON (opaque, ie. hands off other than toString() it) representation of the bounds + // used. + BSONObj indexBounds; - size_t seenInvalidated; - // TODO: we could track key sizes here. + // >1 if we're traversing the index along with its order. <1 if we're traversing it + // against the order. + int direction; - // Number of entries retrieved from the index during the scan. - size_t keysExamined; + // index properties + // Whether this index is over a field that contain array values. 
+ bool isMultiKey; + bool isPartial; + bool isSparse; + bool isUnique; - }; + size_t dupsTested; + size_t dupsDropped; - struct LimitStats : public SpecificStats { - LimitStats() : limit(0) { } + size_t seenInvalidated; + // TODO: we could track key sizes here. - virtual SpecificStats* clone() const { - LimitStats* specific = new LimitStats(*this); - return specific; - } + // Number of entries retrieved from the index during the scan. + size_t keysExamined; +}; - size_t limit; - }; +struct LimitStats : public SpecificStats { + LimitStats() : limit(0) {} - struct MockStats : public SpecificStats { - MockStats() { } + virtual SpecificStats* clone() const { + LimitStats* specific = new LimitStats(*this); + return specific; + } - virtual SpecificStats* clone() const { - return new MockStats(*this); - } - }; + size_t limit; +}; - struct MultiPlanStats : public SpecificStats { - MultiPlanStats() { } +struct MockStats : public SpecificStats { + MockStats() {} - virtual SpecificStats* clone() const { - return new MultiPlanStats(*this); - } - }; + virtual SpecificStats* clone() const { + return new MockStats(*this); + } +}; - struct OrStats : public SpecificStats { - OrStats() : dupsTested(0), - dupsDropped(0), - locsForgotten(0) { } +struct MultiPlanStats : public SpecificStats { + MultiPlanStats() {} - virtual ~OrStats() { } + virtual SpecificStats* clone() const { + return new MultiPlanStats(*this); + } +}; - virtual SpecificStats* clone() const { - OrStats* specific = new OrStats(*this); - return specific; - } +struct OrStats : public SpecificStats { + OrStats() : dupsTested(0), dupsDropped(0), locsForgotten(0) {} - size_t dupsTested; - size_t dupsDropped; + virtual ~OrStats() {} - // How many calls to invalidate(...) actually removed a RecordId from our deduping map? 
- size_t locsForgotten; - }; + virtual SpecificStats* clone() const { + OrStats* specific = new OrStats(*this); + return specific; + } - struct ProjectionStats : public SpecificStats { - ProjectionStats() { } + size_t dupsTested; + size_t dupsDropped; - virtual SpecificStats* clone() const { - ProjectionStats* specific = new ProjectionStats(*this); - return specific; - } + // How many calls to invalidate(...) actually removed a RecordId from our deduping map? + size_t locsForgotten; +}; - // Object specifying the projection transformation to apply. - BSONObj projObj; - }; +struct ProjectionStats : public SpecificStats { + ProjectionStats() {} - struct SortStats : public SpecificStats { - SortStats() : forcedFetches(0), memUsage(0), memLimit(0) { } + virtual SpecificStats* clone() const { + ProjectionStats* specific = new ProjectionStats(*this); + return specific; + } - virtual ~SortStats() { } + // Object specifying the projection transformation to apply. + BSONObj projObj; +}; - virtual SpecificStats* clone() const { - SortStats* specific = new SortStats(*this); - return specific; - } +struct SortStats : public SpecificStats { + SortStats() : forcedFetches(0), memUsage(0), memLimit(0) {} - // How many records were we forced to fetch as the result of an invalidation? - size_t forcedFetches; + virtual ~SortStats() {} - // What's our current memory usage? - size_t memUsage; + virtual SpecificStats* clone() const { + SortStats* specific = new SortStats(*this); + return specific; + } - // What's our memory limit? - size_t memLimit; + // How many records were we forced to fetch as the result of an invalidation? + size_t forcedFetches; - // The number of results to return from the sort. - size_t limit; + // What's our current memory usage? + size_t memUsage; - // The pattern according to which we are sorting. - BSONObj sortPattern; - }; + // What's our memory limit? 
+ size_t memLimit; - struct MergeSortStats : public SpecificStats { - MergeSortStats() : dupsTested(0), - dupsDropped(0), - forcedFetches(0) { } + // The number of results to return from the sort. + size_t limit; - virtual ~MergeSortStats() { } + // The pattern according to which we are sorting. + BSONObj sortPattern; +}; - virtual SpecificStats* clone() const { - MergeSortStats* specific = new MergeSortStats(*this); - return specific; - } +struct MergeSortStats : public SpecificStats { + MergeSortStats() : dupsTested(0), dupsDropped(0), forcedFetches(0) {} - size_t dupsTested; - size_t dupsDropped; + virtual ~MergeSortStats() {} - // How many records were we forced to fetch as the result of an invalidation? - size_t forcedFetches; + virtual SpecificStats* clone() const { + MergeSortStats* specific = new MergeSortStats(*this); + return specific; + } - // The pattern according to which we are sorting. - BSONObj sortPattern; - }; + size_t dupsTested; + size_t dupsDropped; - struct ShardingFilterStats : public SpecificStats { - ShardingFilterStats() : chunkSkips(0) { } + // How many records were we forced to fetch as the result of an invalidation? + size_t forcedFetches; - virtual SpecificStats* clone() const { - ShardingFilterStats* specific = new ShardingFilterStats(*this); - return specific; - } + // The pattern according to which we are sorting. 
+ BSONObj sortPattern; +}; - size_t chunkSkips; - }; +struct ShardingFilterStats : public SpecificStats { + ShardingFilterStats() : chunkSkips(0) {} - struct SkipStats : public SpecificStats { - SkipStats() : skip(0) { } + virtual SpecificStats* clone() const { + ShardingFilterStats* specific = new ShardingFilterStats(*this); + return specific; + } - virtual SpecificStats* clone() const { - SkipStats* specific = new SkipStats(*this); - return specific; - } + size_t chunkSkips; +}; - size_t skip; - }; - - struct IntervalStats { - - IntervalStats() : - numResultsFound(0), - numResultsBuffered(0), - minDistanceAllowed(-1), - maxDistanceAllowed(-1), - inclusiveMaxDistanceAllowed(false), - minDistanceFound(-1), - maxDistanceFound(-1), - minDistanceBuffered(-1), - maxDistanceBuffered(-1) { - } +struct SkipStats : public SpecificStats { + SkipStats() : skip(0) {} - long long numResultsFound; - long long numResultsBuffered; + virtual SpecificStats* clone() const { + SkipStats* specific = new SkipStats(*this); + return specific; + } - double minDistanceAllowed; - double maxDistanceAllowed; - bool inclusiveMaxDistanceAllowed; + size_t skip; +}; - double minDistanceFound; - double maxDistanceFound; - double minDistanceBuffered; - double maxDistanceBuffered; - }; +struct IntervalStats { + IntervalStats() + : numResultsFound(0), + numResultsBuffered(0), + minDistanceAllowed(-1), + maxDistanceAllowed(-1), + inclusiveMaxDistanceAllowed(false), + minDistanceFound(-1), + maxDistanceFound(-1), + minDistanceBuffered(-1), + maxDistanceBuffered(-1) {} - class NearStats : public SpecificStats { - public: + long long numResultsFound; + long long numResultsBuffered; - NearStats() {} + double minDistanceAllowed; + double maxDistanceAllowed; + bool inclusiveMaxDistanceAllowed; - virtual SpecificStats* clone() const { - return new NearStats(*this); - } + double minDistanceFound; + double maxDistanceFound; + double minDistanceBuffered; + double maxDistanceBuffered; +}; - long long 
totalResultsFound() { - long long totalResultsFound = 0; - for (std::vector<IntervalStats>::iterator it = intervalStats.begin(); - it != intervalStats.end(); ++it) { - totalResultsFound += it->numResultsFound; - } - return totalResultsFound; - } +class NearStats : public SpecificStats { +public: + NearStats() {} + + virtual SpecificStats* clone() const { + return new NearStats(*this); + } - std::vector<IntervalStats> intervalStats; - std::string indexName; - BSONObj keyPattern; - }; - - struct UpdateStats : public SpecificStats { - UpdateStats() - : nMatched(0), - nModified(0), - isDocReplacement(false), - fastmod(false), - fastmodinsert(false), - inserted(false), - nInvalidateSkips(0) { } - - virtual SpecificStats* clone() const { - return new UpdateStats(*this); + long long totalResultsFound() { + long long totalResultsFound = 0; + for (std::vector<IntervalStats>::iterator it = intervalStats.begin(); + it != intervalStats.end(); + ++it) { + totalResultsFound += it->numResultsFound; } + return totalResultsFound; + } - // The number of documents which match the query part of the update. - size_t nMatched; + std::vector<IntervalStats> intervalStats; + std::string indexName; + BSONObj keyPattern; +}; - // The number of documents modified by this update. - size_t nModified; +struct UpdateStats : public SpecificStats { + UpdateStats() + : nMatched(0), + nModified(0), + isDocReplacement(false), + fastmod(false), + fastmodinsert(false), + inserted(false), + nInvalidateSkips(0) {} - // True iff this is a doc-replacement style update, as opposed to a $mod update. - bool isDocReplacement; + virtual SpecificStats* clone() const { + return new UpdateStats(*this); + } - // A 'fastmod' update is an in-place update that does not have to modify - // any indices. It's "fast" because the only work needed is changing the bits - // inside the document. - bool fastmod; + // The number of documents which match the query part of the update. 
+ size_t nMatched; - // A 'fastmodinsert' is an insert resulting from an {upsert: true} update - // which is a doc-replacement style update. It's "fast" because we don't need - // to compute the document to insert based on the modifiers. - bool fastmodinsert; + // The number of documents modified by this update. + size_t nModified; - // Is this an {upsert: true} update that did an insert? - bool inserted; + // True iff this is a doc-replacement style update, as opposed to a $mod update. + bool isDocReplacement; - // The object that was inserted. This is an empty document if no insert was performed. - BSONObj objInserted; + // A 'fastmod' update is an in-place update that does not have to modify + // any indices. It's "fast" because the only work needed is changing the bits + // inside the document. + bool fastmod; - // Invalidated documents can be force-fetched, causing the now invalid RecordId to - // be thrown out. The update stage skips over any results which do not have the - // RecordId to update. - size_t nInvalidateSkips; - }; + // A 'fastmodinsert' is an insert resulting from an {upsert: true} update + // which is a doc-replacement style update. It's "fast" because we don't need + // to compute the document to insert based on the modifiers. + bool fastmodinsert; - struct TextStats : public SpecificStats { - TextStats() : keysExamined(0), fetches(0), parsedTextQuery() { } + // Is this an {upsert: true} update that did an insert? + bool inserted; - virtual SpecificStats* clone() const { - TextStats* specific = new TextStats(*this); - return specific; - } + // The object that was inserted. This is an empty document if no insert was performed. + BSONObj objInserted; + + // Invalidated documents can be force-fetched, causing the now invalid RecordId to + // be thrown out. The update stage skips over any results which do not have the + // RecordId to update. 
+ size_t nInvalidateSkips; +}; + +struct TextStats : public SpecificStats { + TextStats() : keysExamined(0), fetches(0), parsedTextQuery() {} + + virtual SpecificStats* clone() const { + TextStats* specific = new TextStats(*this); + return specific; + } - std::string indexName; + std::string indexName; - size_t keysExamined; + size_t keysExamined; - size_t fetches; + size_t fetches; - // Human-readable form of the FTSQuery associated with the text stage. - BSONObj parsedTextQuery; + // Human-readable form of the FTSQuery associated with the text stage. + BSONObj parsedTextQuery; - // Index keys that precede the "text" index key. - BSONObj indexPrefix; - }; + // Index keys that precede the "text" index key. + BSONObj indexPrefix; +}; } // namespace mongo diff --git a/src/mongo/db/exec/plan_stats_test.cpp b/src/mongo/db/exec/plan_stats_test.cpp index 805401ea9a5..02152308d12 100644 --- a/src/mongo/db/exec/plan_stats_test.cpp +++ b/src/mongo/db/exec/plan_stats_test.cpp @@ -38,70 +38,70 @@ using namespace mongo; namespace { - /** - * Basic test on field initializers - */ - TEST(CommonStatsTest, defaultValues) { - CommonStats stats; - ASSERT_EQUALS(stats.works, static_cast<size_t>(0)); - ASSERT_EQUALS(stats.yields, static_cast<size_t>(0)); - ASSERT_EQUALS(stats.invalidates, static_cast<size_t>(0)); - ASSERT_EQUALS(stats.advanced, static_cast<size_t>(0)); - ASSERT_EQUALS(stats.needTime, static_cast<size_t>(0)); - ASSERT_EQUALS(stats.needYield, static_cast<size_t>(0)); - ASSERT_FALSE(stats.isEOF); - } +/** + * Basic test on field initializers + */ +TEST(CommonStatsTest, defaultValues) { + CommonStats stats; + ASSERT_EQUALS(stats.works, static_cast<size_t>(0)); + ASSERT_EQUALS(stats.yields, static_cast<size_t>(0)); + ASSERT_EQUALS(stats.invalidates, static_cast<size_t>(0)); + ASSERT_EQUALS(stats.advanced, static_cast<size_t>(0)); + ASSERT_EQUALS(stats.needTime, static_cast<size_t>(0)); + ASSERT_EQUALS(stats.needYield, static_cast<size_t>(0)); + ASSERT_FALSE(stats.isEOF); 
+} - /** - * Verifies null argument check in CommonStats::writeExplainTo - */ - TEST(CommonStatsTest, writeExplainToNullBuilder) { - CommonStats stats; - stats.writeExplainTo(NULL); - } +/** + * Verifies null argument check in CommonStats::writeExplainTo + */ +TEST(CommonStatsTest, writeExplainToNullBuilder) { + CommonStats stats; + stats.writeExplainTo(NULL); +} - /** - * Verifies null argument check in PlanStageStats::writeExplainTo - */ - TEST(PlanStageStatsTest, writeExplainToNullBuilder) { - CommonStats stats; - PlanStageStats pss(stats); - pss.writeExplainTo(NULL); - } +/** + * Verifies null argument check in PlanStageStats::writeExplainTo + */ +TEST(PlanStageStatsTest, writeExplainToNullBuilder) { + CommonStats stats; + PlanStageStats pss(stats); + pss.writeExplainTo(NULL); +} - /** - * Checks BSON output of CommonStats::writeExplainTo to ensure it contains - * correct values for CommonStats fields - */ - TEST(CommonStatsTest, writeExplainTo) { - CommonStats stats; - stats.works = static_cast<size_t>(2); - stats.advanced = static_cast<size_t>(3); - BSONObjBuilder bob; - stats.writeExplainTo(&bob); - BSONObj obj = bob.done(); - ASSERT_TRUE(obj.hasField("works")); - ASSERT_EQUALS(obj.getIntField("works"), 2); - ASSERT_TRUE(obj.hasField("advanced")); - ASSERT_EQUALS(obj.getIntField("advanced"), 3); - } +/** + * Checks BSON output of CommonStats::writeExplainTo to ensure it contains + * correct values for CommonStats fields + */ +TEST(CommonStatsTest, writeExplainTo) { + CommonStats stats; + stats.works = static_cast<size_t>(2); + stats.advanced = static_cast<size_t>(3); + BSONObjBuilder bob; + stats.writeExplainTo(&bob); + BSONObj obj = bob.done(); + ASSERT_TRUE(obj.hasField("works")); + ASSERT_EQUALS(obj.getIntField("works"), 2); + ASSERT_TRUE(obj.hasField("advanced")); + ASSERT_EQUALS(obj.getIntField("advanced"), 3); +} - /** - * Checks BSON output of PlanStageStats::writeExplainTo to ensure it contains - * correct values for CommonStats fields - */ - 
TEST(PlanStageStatsTest, writeExplainTo) { - CommonStats stats; - stats.works = static_cast<size_t>(2); - stats.advanced = static_cast<size_t>(3); - BSONObjBuilder bob; - PlanStageStats pss(stats); - pss.writeExplainTo(&bob); - BSONObj obj = bob.done(); - ASSERT_TRUE(obj.hasField("works")); - ASSERT_EQUALS(obj.getIntField("works"), 2); - ASSERT_TRUE(obj.hasField("advanced")); - ASSERT_EQUALS(obj.getIntField("advanced"), 3); - } +/** + * Checks BSON output of PlanStageStats::writeExplainTo to ensure it contains + * correct values for CommonStats fields + */ +TEST(PlanStageStatsTest, writeExplainTo) { + CommonStats stats; + stats.works = static_cast<size_t>(2); + stats.advanced = static_cast<size_t>(3); + BSONObjBuilder bob; + PlanStageStats pss(stats); + pss.writeExplainTo(&bob); + BSONObj obj = bob.done(); + ASSERT_TRUE(obj.hasField("works")); + ASSERT_EQUALS(obj.getIntField("works"), 2); + ASSERT_TRUE(obj.hasField("advanced")); + ASSERT_EQUALS(obj.getIntField("advanced"), 3); +} } // namespace diff --git a/src/mongo/db/exec/projection.cpp b/src/mongo/db/exec/projection.cpp index 15d06963b31..65ab6a1323c 100644 --- a/src/mongo/db/exec/projection.cpp +++ b/src/mongo/db/exec/projection.cpp @@ -41,256 +41,241 @@ namespace mongo { - using std::unique_ptr; - using std::endl; - using std::vector; - - static const char* kIdField = "_id"; - - // static - const char* ProjectionStage::kStageType = "PROJECTION"; - - ProjectionStage::ProjectionStage(const ProjectionStageParams& params, - WorkingSet* ws, - PlanStage* child) - : _ws(ws), - _child(child), - _commonStats(kStageType), - _projImpl(params.projImpl) { - - _projObj = params.projObj; - - if (ProjectionStageParams::NO_FAST_PATH == _projImpl) { - _exec.reset(new ProjectionExec(params.projObj, - params.fullExpression, - *params.whereCallback)); - } - else { - // We shouldn't need the full expression if we're fast-pathing. - invariant(NULL == params.fullExpression); - - // Sanity-check the input. 
- invariant(_projObj.isOwned()); - invariant(!_projObj.isEmpty()); - - // Figure out what fields are in the projection. - getSimpleInclusionFields(_projObj, &_includedFields); - - // If we're pulling data out of one index we can pre-compute the indices of the fields - // in the key that we pull data from and avoid looking up the field name each time. - if (ProjectionStageParams::COVERED_ONE_INDEX == params.projImpl) { - // Sanity-check. - _coveredKeyObj = params.coveredKeyObj; - invariant(_coveredKeyObj.isOwned()); - - BSONObjIterator kpIt(_coveredKeyObj); - while (kpIt.more()) { - BSONElement elt = kpIt.next(); - unordered_set<StringData, StringData::Hasher>::iterator fieldIt; - fieldIt = _includedFields.find(elt.fieldNameStringData()); - - if (_includedFields.end() == fieldIt) { - // Push an unused value on the back to keep _includeKey and _keyFieldNames - // in sync. - _keyFieldNames.push_back(StringData()); - _includeKey.push_back(false); - } - else { - // If we are including this key field store its field name. - _keyFieldNames.push_back(*fieldIt); - _includeKey.push_back(true); - } +using std::unique_ptr; +using std::endl; +using std::vector; + +static const char* kIdField = "_id"; + +// static +const char* ProjectionStage::kStageType = "PROJECTION"; + +ProjectionStage::ProjectionStage(const ProjectionStageParams& params, + WorkingSet* ws, + PlanStage* child) + : _ws(ws), _child(child), _commonStats(kStageType), _projImpl(params.projImpl) { + _projObj = params.projObj; + + if (ProjectionStageParams::NO_FAST_PATH == _projImpl) { + _exec.reset( + new ProjectionExec(params.projObj, params.fullExpression, *params.whereCallback)); + } else { + // We shouldn't need the full expression if we're fast-pathing. + invariant(NULL == params.fullExpression); + + // Sanity-check the input. + invariant(_projObj.isOwned()); + invariant(!_projObj.isEmpty()); + + // Figure out what fields are in the projection. 
+ getSimpleInclusionFields(_projObj, &_includedFields); + + // If we're pulling data out of one index we can pre-compute the indices of the fields + // in the key that we pull data from and avoid looking up the field name each time. + if (ProjectionStageParams::COVERED_ONE_INDEX == params.projImpl) { + // Sanity-check. + _coveredKeyObj = params.coveredKeyObj; + invariant(_coveredKeyObj.isOwned()); + + BSONObjIterator kpIt(_coveredKeyObj); + while (kpIt.more()) { + BSONElement elt = kpIt.next(); + unordered_set<StringData, StringData::Hasher>::iterator fieldIt; + fieldIt = _includedFields.find(elt.fieldNameStringData()); + + if (_includedFields.end() == fieldIt) { + // Push an unused value on the back to keep _includeKey and _keyFieldNames + // in sync. + _keyFieldNames.push_back(StringData()); + _includeKey.push_back(false); + } else { + // If we are including this key field store its field name. + _keyFieldNames.push_back(*fieldIt); + _includeKey.push_back(true); } } - else { - invariant(ProjectionStageParams::SIMPLE_DOC == params.projImpl); - } + } else { + invariant(ProjectionStageParams::SIMPLE_DOC == params.projImpl); } } - - // static - void ProjectionStage::getSimpleInclusionFields(const BSONObj& projObj, - FieldSet* includedFields) { - // The _id is included by default. - bool includeId = true; - - // Figure out what fields are in the projection. TODO: we can get this from the - // ParsedProjection...modify that to have this type instead of a vector. - BSONObjIterator projObjIt(projObj); - while (projObjIt.more()) { - BSONElement elt = projObjIt.next(); - // Must deal with the _id case separately as there is an implicit _id: 1 in the - // projection. 
- if (mongoutils::str::equals(elt.fieldName(), kIdField) - && !elt.trueValue()) { - includeId = false; - continue; - } - includedFields->insert(elt.fieldNameStringData()); - } - - if (includeId) { - includedFields->insert(kIdField); +} + +// static +void ProjectionStage::getSimpleInclusionFields(const BSONObj& projObj, FieldSet* includedFields) { + // The _id is included by default. + bool includeId = true; + + // Figure out what fields are in the projection. TODO: we can get this from the + // ParsedProjection...modify that to have this type instead of a vector. + BSONObjIterator projObjIt(projObj); + while (projObjIt.more()) { + BSONElement elt = projObjIt.next(); + // Must deal with the _id case separately as there is an implicit _id: 1 in the + // projection. + if (mongoutils::str::equals(elt.fieldName(), kIdField) && !elt.trueValue()) { + includeId = false; + continue; } + includedFields->insert(elt.fieldNameStringData()); } - // static - void ProjectionStage::transformSimpleInclusion(const BSONObj& in, - const FieldSet& includedFields, - BSONObjBuilder& bob) { - // Look at every field in the source document and see if we're including it. - BSONObjIterator inputIt(in); - while (inputIt.more()) { - BSONElement elt = inputIt.next(); - unordered_set<StringData, StringData::Hasher>::const_iterator fieldIt; - fieldIt = includedFields.find(elt.fieldNameStringData()); - if (includedFields.end() != fieldIt) { - // If so, add it to the builder. - bob.append(elt); - } - } + if (includeId) { + includedFields->insert(kIdField); } - - Status ProjectionStage::transform(WorkingSetMember* member) { - // The default no-fast-path case. - if (ProjectionStageParams::NO_FAST_PATH == _projImpl) { - return _exec->transform(member); - } - - BSONObjBuilder bob; - - // Note that even if our fast path analysis is bug-free something that is - // covered might be invalidated and just be an obj. 
In this case we just go - // through the SIMPLE_DOC path which is still correct if the covered data - // is not available. - // - // SIMPLE_DOC implies that we expect an object so it's kind of redundant. - if ((ProjectionStageParams::SIMPLE_DOC == _projImpl) || member->hasObj()) { - // If we got here because of SIMPLE_DOC the planner shouldn't have messed up. - invariant(member->hasObj()); - - // Apply the SIMPLE_DOC projection. - transformSimpleInclusion(member->obj.value(), _includedFields, bob); +} + +// static +void ProjectionStage::transformSimpleInclusion(const BSONObj& in, + const FieldSet& includedFields, + BSONObjBuilder& bob) { + // Look at every field in the source document and see if we're including it. + BSONObjIterator inputIt(in); + while (inputIt.more()) { + BSONElement elt = inputIt.next(); + unordered_set<StringData, StringData::Hasher>::const_iterator fieldIt; + fieldIt = includedFields.find(elt.fieldNameStringData()); + if (includedFields.end() != fieldIt) { + // If so, add it to the builder. + bob.append(elt); } - else { - invariant(ProjectionStageParams::COVERED_ONE_INDEX == _projImpl); - // We're pulling data out of the key. - invariant(1 == member->keyData.size()); - size_t keyIndex = 0; - - // Look at every key element... - BSONObjIterator keyIterator(member->keyData[0].keyData); - while (keyIterator.more()) { - BSONElement elt = keyIterator.next(); - // If we're supposed to include it... - if (_includeKey[keyIndex]) { - // Do so. 
- bob.appendAs(elt, _keyFieldNames[keyIndex]); - } - ++keyIndex; - } - } - - member->state = WorkingSetMember::OWNED_OBJ; - member->keyData.clear(); - member->loc = RecordId(); - member->obj = Snapshotted<BSONObj>(SnapshotId(), bob.obj()); - return Status::OK(); } +} - ProjectionStage::~ProjectionStage() { } - - bool ProjectionStage::isEOF() { return _child->isEOF(); } - - PlanStage::StageState ProjectionStage::work(WorkingSetID* out) { - ++_commonStats.works; - - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); - - WorkingSetID id = WorkingSet::INVALID_ID; - StageState status = _child->work(&id); - - // Note that we don't do the normal if isEOF() return EOF thing here. Our child might be a - // tailable cursor and isEOF() would be true even if it had more data... - if (PlanStage::ADVANCED == status) { - WorkingSetMember* member = _ws->get(id); - // Punt to our specific projection impl. - Status projStatus = transform(member); - if (!projStatus.isOK()) { - warning() << "Couldn't execute projection, status = " - << projStatus.toString() << endl; - *out = WorkingSetCommon::allocateStatusMember(_ws, projStatus); - return PlanStage::FAILURE; - } +Status ProjectionStage::transform(WorkingSetMember* member) { + // The default no-fast-path case. + if (ProjectionStageParams::NO_FAST_PATH == _projImpl) { + return _exec->transform(member); + } - *out = id; - ++_commonStats.advanced; - } - else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { - *out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. 
- if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "projection stage failed to read in results from child"; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); + BSONObjBuilder bob; + + // Note that even if our fast path analysis is bug-free something that is + // covered might be invalidated and just be an obj. In this case we just go + // through the SIMPLE_DOC path which is still correct if the covered data + // is not available. + // + // SIMPLE_DOC implies that we expect an object so it's kind of redundant. + if ((ProjectionStageParams::SIMPLE_DOC == _projImpl) || member->hasObj()) { + // If we got here because of SIMPLE_DOC the planner shouldn't have messed up. + invariant(member->hasObj()); + + // Apply the SIMPLE_DOC projection. + transformSimpleInclusion(member->obj.value(), _includedFields, bob); + } else { + invariant(ProjectionStageParams::COVERED_ONE_INDEX == _projImpl); + // We're pulling data out of the key. + invariant(1 == member->keyData.size()); + size_t keyIndex = 0; + + // Look at every key element... + BSONObjIterator keyIterator(member->keyData[0].keyData); + while (keyIterator.more()) { + BSONElement elt = keyIterator.next(); + // If we're supposed to include it... + if (_includeKey[keyIndex]) { + // Do so. 
+ bob.appendAs(elt, _keyFieldNames[keyIndex]); } + ++keyIndex; } - else if (PlanStage::NEED_TIME == status) { - _commonStats.needTime++; - } - else if (PlanStage::NEED_YIELD == status) { - _commonStats.needYield++; - *out = id; - } - - return status; } - void ProjectionStage::saveState() { - ++_commonStats.yields; - _child->saveState(); - } - - void ProjectionStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; - _child->restoreState(opCtx); - } - - void ProjectionStage::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - ++_commonStats.invalidates; - _child->invalidate(txn, dl, type); - } - - vector<PlanStage*> ProjectionStage::getChildren() const { - vector<PlanStage*> children; - children.push_back(_child.get()); - return children; - } - - PlanStageStats* ProjectionStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_PROJECTION)); - - ProjectionStats* projStats = new ProjectionStats(_specificStats); - projStats->projObj = _projObj; - ret->specific.reset(projStats); - - ret->children.push_back(_child->getStats()); - return ret.release(); - } + member->state = WorkingSetMember::OWNED_OBJ; + member->keyData.clear(); + member->loc = RecordId(); + member->obj = Snapshotted<BSONObj>(SnapshotId(), bob.obj()); + return Status::OK(); +} + +ProjectionStage::~ProjectionStage() {} + +bool ProjectionStage::isEOF() { + return _child->isEOF(); +} + +PlanStage::StageState ProjectionStage::work(WorkingSetID* out) { + ++_commonStats.works; + + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); + + WorkingSetID id = WorkingSet::INVALID_ID; + StageState status = _child->work(&id); + + // Note that we don't do the normal if isEOF() return EOF thing here. Our child might be a + // tailable cursor and isEOF() would be true even if it had more data... 
+ if (PlanStage::ADVANCED == status) { + WorkingSetMember* member = _ws->get(id); + // Punt to our specific projection impl. + Status projStatus = transform(member); + if (!projStatus.isOK()) { + warning() << "Couldn't execute projection, status = " << projStatus.toString() << endl; + *out = WorkingSetCommon::allocateStatusMember(_ws, projStatus); + return PlanStage::FAILURE; + } - const CommonStats* ProjectionStage::getCommonStats() const { - return &_commonStats; + *out = id; + ++_commonStats.advanced; + } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. + if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "projection stage failed to read in results from child"; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + } + } else if (PlanStage::NEED_TIME == status) { + _commonStats.needTime++; + } else if (PlanStage::NEED_YIELD == status) { + _commonStats.needYield++; + *out = id; } - const SpecificStats* ProjectionStage::getSpecificStats() const { - return &_specificStats; - } + return status; +} + +void ProjectionStage::saveState() { + ++_commonStats.yields; + _child->saveState(); +} + +void ProjectionStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; + _child->restoreState(opCtx); +} + +void ProjectionStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + _child->invalidate(txn, dl, type); +} + +vector<PlanStage*> ProjectionStage::getChildren() const { + vector<PlanStage*> children; + children.push_back(_child.get()); + return children; +} + +PlanStageStats* ProjectionStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_PROJECTION)); 
+ + ProjectionStats* projStats = new ProjectionStats(_specificStats); + projStats->projObj = _projObj; + ret->specific.reset(projStats); + + ret->children.push_back(_child->getStats()); + return ret.release(); +} + +const CommonStats* ProjectionStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* ProjectionStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/projection.h b/src/mongo/db/exec/projection.h index d70967eebe1..b09ef956cd2 100644 --- a/src/mongo/db/exec/projection.h +++ b/src/mongo/db/exec/projection.h @@ -37,123 +37,122 @@ namespace mongo { - struct ProjectionStageParams { - enum ProjectionImplementation { - // The default case. Will handle every projection. - NO_FAST_PATH, +struct ProjectionStageParams { + enum ProjectionImplementation { + // The default case. Will handle every projection. + NO_FAST_PATH, - // The projection is simple inclusion and is totally covered by one index. - COVERED_ONE_INDEX, + // The projection is simple inclusion and is totally covered by one index. + COVERED_ONE_INDEX, - // The projection is simple inclusion and we expect an object. - SIMPLE_DOC - }; + // The projection is simple inclusion and we expect an object. + SIMPLE_DOC + }; - ProjectionStageParams(const MatchExpressionParser::WhereCallback& wc) - : projImpl(NO_FAST_PATH), fullExpression(NULL), whereCallback(&wc) { } + ProjectionStageParams(const MatchExpressionParser::WhereCallback& wc) + : projImpl(NO_FAST_PATH), fullExpression(NULL), whereCallback(&wc) {} - ProjectionImplementation projImpl; + ProjectionImplementation projImpl; - // The projection object. We lack a ProjectionExpression or similar so we use a BSONObj. - BSONObj projObj; + // The projection object. We lack a ProjectionExpression or similar so we use a BSONObj. + BSONObj projObj; - // If we have a positional or elemMatch projection we need a MatchExpression to pull out the - // right data. 
- // Not owned here, we do not take ownership. - const MatchExpression* fullExpression; + // If we have a positional or elemMatch projection we need a MatchExpression to pull out the + // right data. + // Not owned here, we do not take ownership. + const MatchExpression* fullExpression; - // If (COVERED_ONE_INDEX == projObj) this is the key pattern we're extracting covered data - // from. Otherwise, this field is ignored. - BSONObj coveredKeyObj; + // If (COVERED_ONE_INDEX == projObj) this is the key pattern we're extracting covered data + // from. Otherwise, this field is ignored. + BSONObj coveredKeyObj; - // Used for creating context for the $where clause processing. Not owned. - const MatchExpressionParser::WhereCallback* whereCallback; - }; + // Used for creating context for the $where clause processing. Not owned. + const MatchExpressionParser::WhereCallback* whereCallback; +}; - /** - * This stage computes a projection. - */ - class ProjectionStage : public PlanStage { - public: - ProjectionStage(const ProjectionStageParams& params, - WorkingSet* ws, - PlanStage* child); +/** + * This stage computes a projection. 
+ */ +class ProjectionStage : public PlanStage { +public: + ProjectionStage(const ProjectionStageParams& params, WorkingSet* ws, PlanStage* child); - virtual ~ProjectionStage(); + virtual ~ProjectionStage(); - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_PROJECTION; } + virtual StageType stageType() const { + return STAGE_PROJECTION; + } - PlanStageStats* getStats(); + PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - typedef unordered_set<StringData, StringData::Hasher> FieldSet; + typedef unordered_set<StringData, StringData::Hasher> FieldSet; - /** - * Given the projection spec for a simple inclusion projection, - * 'projObj', populates 'includedFields' with the set of field - * names to be included. - */ - static void getSimpleInclusionFields(const BSONObj& projObj, - FieldSet* includedFields); + /** + * Given the projection spec for a simple inclusion projection, + * 'projObj', populates 'includedFields' with the set of field + * names to be included. + */ + static void getSimpleInclusionFields(const BSONObj& projObj, FieldSet* includedFields); - /** - * Applies a simple inclusion projection to 'in', including - * only the fields specified by 'includedFields'. 
- * - * The resulting document is constructed using 'bob'. - */ - static void transformSimpleInclusion(const BSONObj& in, - const FieldSet& includedFields, - BSONObjBuilder& bob); + /** + * Applies a simple inclusion projection to 'in', including + * only the fields specified by 'includedFields'. + * + * The resulting document is constructed using 'bob'. + */ + static void transformSimpleInclusion(const BSONObj& in, + const FieldSet& includedFields, + BSONObjBuilder& bob); - static const char* kStageType; + static const char* kStageType; - private: - Status transform(WorkingSetMember* member); +private: + Status transform(WorkingSetMember* member); - std::unique_ptr<ProjectionExec> _exec; + std::unique_ptr<ProjectionExec> _exec; - // _ws is not owned by us. - WorkingSet* _ws; - std::unique_ptr<PlanStage> _child; + // _ws is not owned by us. + WorkingSet* _ws; + std::unique_ptr<PlanStage> _child; - // Stats - CommonStats _commonStats; - ProjectionStats _specificStats; + // Stats + CommonStats _commonStats; + ProjectionStats _specificStats; - // Fast paths: - ProjectionStageParams::ProjectionImplementation _projImpl; + // Fast paths: + ProjectionStageParams::ProjectionImplementation _projImpl; - // Used by all projection implementations. - BSONObj _projObj; + // Used by all projection implementations. + BSONObj _projObj; - // Data used for both SIMPLE_DOC and COVERED_ONE_INDEX paths. - // Has the field names present in the simple projection. - unordered_set<StringData, StringData::Hasher> _includedFields; + // Data used for both SIMPLE_DOC and COVERED_ONE_INDEX paths. + // Has the field names present in the simple projection. + unordered_set<StringData, StringData::Hasher> _includedFields; - // - // Used for the COVERED_ONE_INDEX path. - // - BSONObj _coveredKeyObj; + // + // Used for the COVERED_ONE_INDEX path. + // + BSONObj _coveredKeyObj; - // Field names can be empty in 2.4 and before so we can't use them as a sentinel value. 
- // If the i-th entry is true we include the i-th field in the key. - std::vector<bool> _includeKey; + // Field names can be empty in 2.4 and before so we can't use them as a sentinel value. + // If the i-th entry is true we include the i-th field in the key. + std::vector<bool> _includeKey; - // If the i-th entry of _includeKey is true this is the field name for the i-th key field. - std::vector<StringData> _keyFieldNames; - }; + // If the i-th entry of _includeKey is true this is the field name for the i-th key field. + std::vector<StringData> _keyFieldNames; +}; } // namespace mongo diff --git a/src/mongo/db/exec/projection_exec.cpp b/src/mongo/db/exec/projection_exec.cpp index a200f17d381..80a2d8772af 100644 --- a/src/mongo/db/exec/projection_exec.cpp +++ b/src/mongo/db/exec/projection_exec.cpp @@ -36,433 +36,405 @@ namespace mongo { - using std::max; - using std::string; - - ProjectionExec::ProjectionExec() - : _include(true), - _special(false), - _includeID(true), - _skip(0), - _limit(-1), - _arrayOpType(ARRAY_OP_NORMAL), - _hasNonSimple(false), - _hasDottedField(false), - _queryExpression(NULL), - _hasReturnKey(false) { } - - - ProjectionExec::ProjectionExec(const BSONObj& spec, - const MatchExpression* queryExpression, - const MatchExpressionParser::WhereCallback& whereCallback) - : _include(true), - _special(false), - _source(spec), - _includeID(true), - _skip(0), - _limit(-1), - _arrayOpType(ARRAY_OP_NORMAL), - _hasNonSimple(false), - _hasDottedField(false), - _queryExpression(queryExpression), - _hasReturnKey(false) { - - // Are we including or excluding fields? - // -1 when we haven't initialized it. - // 1 when we're including - // 0 when we're excluding. 
- int include_exclude = -1; - - BSONObjIterator it(_source); - while (it.more()) { - BSONElement e = it.next(); - - if (!e.isNumber() && !e.isBoolean()) { - _hasNonSimple = true; - } +using std::max; +using std::string; + +ProjectionExec::ProjectionExec() + : _include(true), + _special(false), + _includeID(true), + _skip(0), + _limit(-1), + _arrayOpType(ARRAY_OP_NORMAL), + _hasNonSimple(false), + _hasDottedField(false), + _queryExpression(NULL), + _hasReturnKey(false) {} + + +ProjectionExec::ProjectionExec(const BSONObj& spec, + const MatchExpression* queryExpression, + const MatchExpressionParser::WhereCallback& whereCallback) + : _include(true), + _special(false), + _source(spec), + _includeID(true), + _skip(0), + _limit(-1), + _arrayOpType(ARRAY_OP_NORMAL), + _hasNonSimple(false), + _hasDottedField(false), + _queryExpression(queryExpression), + _hasReturnKey(false) { + // Are we including or excluding fields? + // -1 when we haven't initialized it. + // 1 when we're including + // 0 when we're excluding. 
+ int include_exclude = -1; + + BSONObjIterator it(_source); + while (it.more()) { + BSONElement e = it.next(); + + if (!e.isNumber() && !e.isBoolean()) { + _hasNonSimple = true; + } - if (Object == e.type()) { - BSONObj obj = e.embeddedObject(); - verify(1 == obj.nFields()); - - BSONElement e2 = obj.firstElement(); - if (mongoutils::str::equals(e2.fieldName(), "$slice")) { - if (e2.isNumber()) { - int i = e2.numberInt(); - if (i < 0) { - add(e.fieldName(), i, -i); // limit is now positive - } - else { - add(e.fieldName(), 0, i); - } + if (Object == e.type()) { + BSONObj obj = e.embeddedObject(); + verify(1 == obj.nFields()); + + BSONElement e2 = obj.firstElement(); + if (mongoutils::str::equals(e2.fieldName(), "$slice")) { + if (e2.isNumber()) { + int i = e2.numberInt(); + if (i < 0) { + add(e.fieldName(), i, -i); // limit is now positive + } else { + add(e.fieldName(), 0, i); } - else { - verify(e2.type() == Array); - BSONObj arr = e2.embeddedObject(); - verify(2 == arr.nFields()); + } else { + verify(e2.type() == Array); + BSONObj arr = e2.embeddedObject(); + verify(2 == arr.nFields()); - BSONObjIterator it(arr); - int skip = it.next().numberInt(); - int limit = it.next().numberInt(); + BSONObjIterator it(arr); + int skip = it.next().numberInt(); + int limit = it.next().numberInt(); - verify(limit > 0); + verify(limit > 0); - add(e.fieldName(), skip, limit); - } - } - else if (mongoutils::str::equals(e2.fieldName(), "$elemMatch")) { - _arrayOpType = ARRAY_OP_ELEM_MATCH; - - // Create a MatchExpression for the elemMatch. - BSONObj elemMatchObj = e.wrap(); - verify(elemMatchObj.isOwned()); - _elemMatchObjs.push_back(elemMatchObj); - StatusWithMatchExpression swme = MatchExpressionParser::parse(elemMatchObj, - whereCallback); - verify(swme.isOK()); - // And store it in _matchers. 
- _matchers[mongoutils::str::before(e.fieldName(), '.').c_str()] - = swme.getValue(); - - add(e.fieldName(), true); - } - else if (mongoutils::str::equals(e2.fieldName(), "$meta")) { - verify(String == e2.type()); - if (e2.valuestr() == LiteParsedQuery::metaTextScore) { - _meta[e.fieldName()] = META_TEXT_SCORE; - } - else if (e2.valuestr() == LiteParsedQuery::metaRecordId) { - _meta[e.fieldName()] = META_RECORDID; - } - else if (e2.valuestr() == LiteParsedQuery::metaGeoNearPoint) { - _meta[e.fieldName()] = META_GEONEAR_POINT; - } - else if (e2.valuestr() == LiteParsedQuery::metaGeoNearDistance) { - _meta[e.fieldName()] = META_GEONEAR_DIST; - } - else if (e2.valuestr() == LiteParsedQuery::metaIndexKey) { - _hasReturnKey = true; - // The index key clobbers everything so just stop parsing here. - return; - } - else { - // This shouldn't happen, should be caught by parsing. - verify(0); - } + add(e.fieldName(), skip, limit); } - else { + } else if (mongoutils::str::equals(e2.fieldName(), "$elemMatch")) { + _arrayOpType = ARRAY_OP_ELEM_MATCH; + + // Create a MatchExpression for the elemMatch. + BSONObj elemMatchObj = e.wrap(); + verify(elemMatchObj.isOwned()); + _elemMatchObjs.push_back(elemMatchObj); + StatusWithMatchExpression swme = + MatchExpressionParser::parse(elemMatchObj, whereCallback); + verify(swme.isOK()); + // And store it in _matchers. 
+ _matchers[mongoutils::str::before(e.fieldName(), '.').c_str()] = swme.getValue(); + + add(e.fieldName(), true); + } else if (mongoutils::str::equals(e2.fieldName(), "$meta")) { + verify(String == e2.type()); + if (e2.valuestr() == LiteParsedQuery::metaTextScore) { + _meta[e.fieldName()] = META_TEXT_SCORE; + } else if (e2.valuestr() == LiteParsedQuery::metaRecordId) { + _meta[e.fieldName()] = META_RECORDID; + } else if (e2.valuestr() == LiteParsedQuery::metaGeoNearPoint) { + _meta[e.fieldName()] = META_GEONEAR_POINT; + } else if (e2.valuestr() == LiteParsedQuery::metaGeoNearDistance) { + _meta[e.fieldName()] = META_GEONEAR_DIST; + } else if (e2.valuestr() == LiteParsedQuery::metaIndexKey) { + _hasReturnKey = true; + // The index key clobbers everything so just stop parsing here. + return; + } else { + // This shouldn't happen, should be caught by parsing. verify(0); } + } else { + verify(0); } - else if (mongoutils::str::equals(e.fieldName(), "_id") && !e.trueValue()) { - _includeID = false; + } else if (mongoutils::str::equals(e.fieldName(), "_id") && !e.trueValue()) { + _includeID = false; + } else { + add(e.fieldName(), e.trueValue()); + + // Projections of dotted fields aren't covered. + if (mongoutils::str::contains(e.fieldName(), '.')) { + _hasDottedField = true; } - else { - add(e.fieldName(), e.trueValue()); - - // Projections of dotted fields aren't covered. - if (mongoutils::str::contains(e.fieldName(), '.')) { - _hasDottedField = true; - } - // Validate input. - if (include_exclude == -1) { - // If we haven't specified an include/exclude, initialize include_exclude. - // We expect further include/excludes to match it. - include_exclude = e.trueValue(); - _include = !e.trueValue(); - } - } - - if (mongoutils::str::contains(e.fieldName(), ".$")) { - _arrayOpType = ARRAY_OP_POSITIONAL; + // Validate input. + if (include_exclude == -1) { + // If we haven't specified an include/exclude, initialize include_exclude. 
+ // We expect further include/excludes to match it. + include_exclude = e.trueValue(); + _include = !e.trueValue(); } } - } - ProjectionExec::~ProjectionExec() { - for (FieldMap::const_iterator it = _fields.begin(); it != _fields.end(); ++it) { - delete it->second; + if (mongoutils::str::contains(e.fieldName(), ".$")) { + _arrayOpType = ARRAY_OP_POSITIONAL; } + } +} - for (Matchers::const_iterator it = _matchers.begin(); it != _matchers.end(); ++it) { - delete it->second; - } +ProjectionExec::~ProjectionExec() { + for (FieldMap::const_iterator it = _fields.begin(); it != _fields.end(); ++it) { + delete it->second; } - void ProjectionExec::add(const string& field, bool include) { - if (field.empty()) { // this is the field the user referred to - _include = include; - } - else { - _include = !include; + for (Matchers::const_iterator it = _matchers.begin(); it != _matchers.end(); ++it) { + delete it->second; + } +} - const size_t dot = field.find('.'); - const string subfield = field.substr(0,dot); - const string rest = (dot == string::npos ? "" : field.substr(dot + 1, string::npos)); +void ProjectionExec::add(const string& field, bool include) { + if (field.empty()) { // this is the field the user referred to + _include = include; + } else { + _include = !include; - ProjectionExec*& fm = _fields[subfield.c_str()]; + const size_t dot = field.find('.'); + const string subfield = field.substr(0, dot); + const string rest = (dot == string::npos ? 
"" : field.substr(dot + 1, string::npos)); - if (NULL == fm) { - fm = new ProjectionExec(); - } + ProjectionExec*& fm = _fields[subfield.c_str()]; - fm->add(rest, include); + if (NULL == fm) { + fm = new ProjectionExec(); } - } - void ProjectionExec::add(const string& field, int skip, int limit) { - _special = true; // can't include or exclude whole object + fm->add(rest, include); + } +} - if (field.empty()) { // this is the field the user referred to - _skip = skip; - _limit = limit; - } - else { - const size_t dot = field.find('.'); - const string subfield = field.substr(0,dot); - const string rest = (dot == string::npos ? "" : field.substr(dot + 1, string::npos)); +void ProjectionExec::add(const string& field, int skip, int limit) { + _special = true; // can't include or exclude whole object - ProjectionExec*& fm = _fields[subfield.c_str()]; + if (field.empty()) { // this is the field the user referred to + _skip = skip; + _limit = limit; + } else { + const size_t dot = field.find('.'); + const string subfield = field.substr(0, dot); + const string rest = (dot == string::npos ? 
"" : field.substr(dot + 1, string::npos)); - if (NULL == fm) { - fm = new ProjectionExec(); - } + ProjectionExec*& fm = _fields[subfield.c_str()]; - fm->add(rest, skip, limit); + if (NULL == fm) { + fm = new ProjectionExec(); } - } - // - // Execution - // + fm->add(rest, skip, limit); + } +} - Status ProjectionExec::transform(WorkingSetMember* member) const { - if (_hasReturnKey) { - BSONObj keyObj; +// +// Execution +// - if (member->hasComputed(WSM_INDEX_KEY)) { - const IndexKeyComputedData* key - = static_cast<const IndexKeyComputedData*>(member->getComputed(WSM_INDEX_KEY)); - keyObj = key->getKey(); - } +Status ProjectionExec::transform(WorkingSetMember* member) const { + if (_hasReturnKey) { + BSONObj keyObj; - member->state = WorkingSetMember::OWNED_OBJ; - member->obj = Snapshotted<BSONObj>(SnapshotId(), keyObj); - member->keyData.clear(); - member->loc = RecordId(); - return Status::OK(); + if (member->hasComputed(WSM_INDEX_KEY)) { + const IndexKeyComputedData* key = + static_cast<const IndexKeyComputedData*>(member->getComputed(WSM_INDEX_KEY)); + keyObj = key->getKey(); } - BSONObjBuilder bob; - if (member->hasObj()) { - MatchDetails matchDetails; + member->state = WorkingSetMember::OWNED_OBJ; + member->obj = Snapshotted<BSONObj>(SnapshotId(), keyObj); + member->keyData.clear(); + member->loc = RecordId(); + return Status::OK(); + } - // If it's a positional projection we need a MatchDetails. - if (transformRequiresDetails()) { - matchDetails.requestElemMatchKey(); - verify(NULL != _queryExpression); - verify(_queryExpression->matchesBSON(member->obj.value(), &matchDetails)); - } + BSONObjBuilder bob; + if (member->hasObj()) { + MatchDetails matchDetails; - Status projStatus = transform(member->obj.value(), &bob, &matchDetails); - if (!projStatus.isOK()) { - return projStatus; - } + // If it's a positional projection we need a MatchDetails. 
+ if (transformRequiresDetails()) { + matchDetails.requestElemMatchKey(); + verify(NULL != _queryExpression); + verify(_queryExpression->matchesBSON(member->obj.value(), &matchDetails)); } - else { - verify(!requiresDocument()); - // Go field by field. - if (_includeID) { - BSONElement elt; - // Sometimes the _id field doesn't exist... - if (member->getFieldDotted("_id", &elt) && !elt.eoo()) { - bob.appendAs(elt, "_id"); - } + + Status projStatus = transform(member->obj.value(), &bob, &matchDetails); + if (!projStatus.isOK()) { + return projStatus; + } + } else { + verify(!requiresDocument()); + // Go field by field. + if (_includeID) { + BSONElement elt; + // Sometimes the _id field doesn't exist... + if (member->getFieldDotted("_id", &elt) && !elt.eoo()) { + bob.appendAs(elt, "_id"); } + } - BSONObjIterator it(_source); - while (it.more()) { - BSONElement specElt = it.next(); - if (mongoutils::str::equals("_id", specElt.fieldName())) { - continue; - } + BSONObjIterator it(_source); + while (it.more()) { + BSONElement specElt = it.next(); + if (mongoutils::str::equals("_id", specElt.fieldName())) { + continue; + } - BSONElement keyElt; - // We can project a field that doesn't exist. We just ignore it. - if (member->getFieldDotted(specElt.fieldName(), &keyElt) && !keyElt.eoo()) { - bob.appendAs(keyElt, specElt.fieldName()); - } + BSONElement keyElt; + // We can project a field that doesn't exist. We just ignore it. 
+ if (member->getFieldDotted(specElt.fieldName(), &keyElt) && !keyElt.eoo()) { + bob.appendAs(keyElt, specElt.fieldName()); } } + } - for (MetaMap::const_iterator it = _meta.begin(); it != _meta.end(); ++it) { - if (META_GEONEAR_DIST == it->second) { - if (member->hasComputed(WSM_COMPUTED_GEO_DISTANCE)) { - const GeoDistanceComputedData* dist - = static_cast<const GeoDistanceComputedData*>( - member->getComputed(WSM_COMPUTED_GEO_DISTANCE)); - bob.append(it->first, dist->getDist()); - } - else { - return Status(ErrorCodes::InternalError, - "near loc dist requested but no data available"); - } + for (MetaMap::const_iterator it = _meta.begin(); it != _meta.end(); ++it) { + if (META_GEONEAR_DIST == it->second) { + if (member->hasComputed(WSM_COMPUTED_GEO_DISTANCE)) { + const GeoDistanceComputedData* dist = static_cast<const GeoDistanceComputedData*>( + member->getComputed(WSM_COMPUTED_GEO_DISTANCE)); + bob.append(it->first, dist->getDist()); + } else { + return Status(ErrorCodes::InternalError, + "near loc dist requested but no data available"); } - else if (META_GEONEAR_POINT == it->second) { - if (member->hasComputed(WSM_GEO_NEAR_POINT)) { - const GeoNearPointComputedData* point - = static_cast<const GeoNearPointComputedData*>( - member->getComputed(WSM_GEO_NEAR_POINT)); - BSONObj ptObj = point->getPoint(); - if (ptObj.couldBeArray()) { - bob.appendArray(it->first, ptObj); - } - else { - bob.append(it->first, ptObj); - } - } - else { - return Status(ErrorCodes::InternalError, - "near loc proj requested but no data available"); - } - } - else if (META_TEXT_SCORE == it->second) { - if (member->hasComputed(WSM_COMPUTED_TEXT_SCORE)) { - const TextScoreComputedData* score - = static_cast<const TextScoreComputedData*>( - member->getComputed(WSM_COMPUTED_TEXT_SCORE)); - bob.append(it->first, score->getScore()); - } - else { - bob.append(it->first, 0.0); + } else if (META_GEONEAR_POINT == it->second) { + if (member->hasComputed(WSM_GEO_NEAR_POINT)) { + const 
GeoNearPointComputedData* point = + static_cast<const GeoNearPointComputedData*>( + member->getComputed(WSM_GEO_NEAR_POINT)); + BSONObj ptObj = point->getPoint(); + if (ptObj.couldBeArray()) { + bob.appendArray(it->first, ptObj); + } else { + bob.append(it->first, ptObj); } + } else { + return Status(ErrorCodes::InternalError, + "near loc proj requested but no data available"); } - else if (META_RECORDID == it->second) { - bob.append(it->first, static_cast<long long>(member->loc.repr())); + } else if (META_TEXT_SCORE == it->second) { + if (member->hasComputed(WSM_COMPUTED_TEXT_SCORE)) { + const TextScoreComputedData* score = static_cast<const TextScoreComputedData*>( + member->getComputed(WSM_COMPUTED_TEXT_SCORE)); + bob.append(it->first, score->getScore()); + } else { + bob.append(it->first, 0.0); } + } else if (META_RECORDID == it->second) { + bob.append(it->first, static_cast<long long>(member->loc.repr())); } - - BSONObj newObj = bob.obj(); - member->state = WorkingSetMember::OWNED_OBJ; - member->obj = Snapshotted<BSONObj>(SnapshotId(), newObj); - member->keyData.clear(); - member->loc = RecordId(); - - return Status::OK(); } - Status ProjectionExec::transform(const BSONObj& in, BSONObj* out) const { - // If it's a positional projection we need a MatchDetails. - MatchDetails matchDetails; - if (transformRequiresDetails()) { - matchDetails.requestElemMatchKey(); - verify(NULL != _queryExpression); - verify(_queryExpression->matchesBSON(in, &matchDetails)); - } - - BSONObjBuilder bob; - Status s = transform(in, &bob, &matchDetails); - if (!s.isOK()) { - return s; - } - *out = bob.obj(); - return Status::OK(); + BSONObj newObj = bob.obj(); + member->state = WorkingSetMember::OWNED_OBJ; + member->obj = Snapshotted<BSONObj>(SnapshotId(), newObj); + member->keyData.clear(); + member->loc = RecordId(); + + return Status::OK(); +} + +Status ProjectionExec::transform(const BSONObj& in, BSONObj* out) const { + // If it's a positional projection we need a MatchDetails. 
+ MatchDetails matchDetails; + if (transformRequiresDetails()) { + matchDetails.requestElemMatchKey(); + verify(NULL != _queryExpression); + verify(_queryExpression->matchesBSON(in, &matchDetails)); } - Status ProjectionExec::transform(const BSONObj& in, - BSONObjBuilder* bob, - const MatchDetails* details) const { + BSONObjBuilder bob; + Status s = transform(in, &bob, &matchDetails); + if (!s.isOK()) { + return s; + } + *out = bob.obj(); + return Status::OK(); +} - const ArrayOpType& arrayOpType = _arrayOpType; +Status ProjectionExec::transform(const BSONObj& in, + BSONObjBuilder* bob, + const MatchDetails* details) const { + const ArrayOpType& arrayOpType = _arrayOpType; - BSONObjIterator it(in); - while (it.more()) { - BSONElement elt = it.next(); + BSONObjIterator it(in); + while (it.more()) { + BSONElement elt = it.next(); - // Case 1: _id - if (mongoutils::str::equals("_id", elt.fieldName())) { - if (_includeID) { - bob->append(elt); - } - continue; + // Case 1: _id + if (mongoutils::str::equals("_id", elt.fieldName())) { + if (_includeID) { + bob->append(elt); } + continue; + } - // Case 2: no array projection for this field. - Matchers::const_iterator matcher = _matchers.find(elt.fieldName()); - if (_matchers.end() == matcher) { - Status s = append(bob, elt, details, arrayOpType); - if (!s.isOK()) { - return s; - } - continue; + // Case 2: no array projection for this field. + Matchers::const_iterator matcher = _matchers.find(elt.fieldName()); + if (_matchers.end() == matcher) { + Status s = append(bob, elt, details, arrayOpType); + if (!s.isOK()) { + return s; } + continue; + } - // Case 3: field has array projection with $elemMatch specified. - if (ARRAY_OP_ELEM_MATCH != arrayOpType) { - return Status(ErrorCodes::BadValue, - "Matchers are only supported for $elemMatch"); - } + // Case 3: field has array projection with $elemMatch specified. 
+ if (ARRAY_OP_ELEM_MATCH != arrayOpType) { + return Status(ErrorCodes::BadValue, "Matchers are only supported for $elemMatch"); + } - MatchDetails arrayDetails; - arrayDetails.requestElemMatchKey(); + MatchDetails arrayDetails; + arrayDetails.requestElemMatchKey(); - if (matcher->second->matchesBSON(in, &arrayDetails)) { - FieldMap::const_iterator fieldIt = _fields.find(elt.fieldName()); - if (_fields.end() == fieldIt) { - return Status(ErrorCodes::BadValue, - "$elemMatch specified, but projection field not found."); - } + if (matcher->second->matchesBSON(in, &arrayDetails)) { + FieldMap::const_iterator fieldIt = _fields.find(elt.fieldName()); + if (_fields.end() == fieldIt) { + return Status(ErrorCodes::BadValue, + "$elemMatch specified, but projection field not found."); + } - BSONArrayBuilder arrBuilder; - BSONObjBuilder subBob; + BSONArrayBuilder arrBuilder; + BSONObjBuilder subBob; - if (in.getField(elt.fieldName()).eoo()) { - return Status(ErrorCodes::InternalError, - "$elemMatch called on document element with eoo"); - } + if (in.getField(elt.fieldName()).eoo()) { + return Status(ErrorCodes::InternalError, + "$elemMatch called on document element with eoo"); + } - if (in.getField(elt.fieldName()).Obj().getField(arrayDetails.elemMatchKey()).eoo()) { - return Status(ErrorCodes::InternalError, - "$elemMatch called on array element with eoo"); - } + if (in.getField(elt.fieldName()).Obj().getField(arrayDetails.elemMatchKey()).eoo()) { + return Status(ErrorCodes::InternalError, + "$elemMatch called on array element with eoo"); + } - arrBuilder.append( - in.getField(elt.fieldName()).Obj().getField(arrayDetails.elemMatchKey())); - subBob.appendArray(matcher->first, arrBuilder.arr()); - Status status = append(bob, subBob.done().firstElement(), details, arrayOpType); - if (!status.isOK()) { - return status; - } + arrBuilder.append( + in.getField(elt.fieldName()).Obj().getField(arrayDetails.elemMatchKey())); + subBob.appendArray(matcher->first, arrBuilder.arr()); + 
Status status = append(bob, subBob.done().firstElement(), details, arrayOpType); + if (!status.isOK()) { + return status; } } - - return Status::OK(); } - void ProjectionExec::appendArray(BSONObjBuilder* bob, const BSONObj& array, bool nested) const { - int skip = nested ? 0 : _skip; - int limit = nested ? -1 : _limit; + return Status::OK(); +} - if (skip < 0) { - skip = max(0, skip + array.nFields()); - } +void ProjectionExec::appendArray(BSONObjBuilder* bob, const BSONObj& array, bool nested) const { + int skip = nested ? 0 : _skip; + int limit = nested ? -1 : _limit; - int index = 0; - BSONObjIterator it(array); - while (it.more()) { - BSONElement elt = it.next(); + if (skip < 0) { + skip = max(0, skip + array.nFields()); + } - if (skip) { - skip--; - continue; - } + int index = 0; + BSONObjIterator it(array); + while (it.more()) { + BSONElement elt = it.next(); - if (limit != -1 && (limit-- == 0)) { - break; - } + if (skip) { + skip--; + continue; + } - switch(elt.type()) { + if (limit != -1 && (limit-- == 0)) { + break; + } + + switch (elt.type()) { case Array: { BSONObjBuilder subBob; appendArray(&subBob, elt.embeddedObject(), true); @@ -482,76 +454,70 @@ namespace mongo { if (_include) { bob->appendAs(elt, bob->numStr(index++)); } - } } } +} + +Status ProjectionExec::append(BSONObjBuilder* bob, + const BSONElement& elt, + const MatchDetails* details, + const ArrayOpType arrayOpType) const { + // Skip if the field name matches a computed $meta field. + // $meta projection fields can exist at the top level of + // the result document and the field names cannot be dotted. + if (_meta.find(elt.fieldName()) != _meta.end()) { + return Status::OK(); + } - Status ProjectionExec::append(BSONObjBuilder* bob, - const BSONElement& elt, - const MatchDetails* details, - const ArrayOpType arrayOpType) const { - - - // Skip if the field name matches a computed $meta field. 
- // $meta projection fields can exist at the top level of - // the result document and the field names cannot be dotted. - if (_meta.find(elt.fieldName()) != _meta.end()) { - return Status::OK(); + FieldMap::const_iterator field = _fields.find(elt.fieldName()); + if (field == _fields.end()) { + if (_include) { + bob->append(elt); } + return Status::OK(); + } - FieldMap::const_iterator field = _fields.find(elt.fieldName()); - if (field == _fields.end()) { - if (_include) { - bob->append(elt); - } - return Status::OK(); + ProjectionExec& subfm = *field->second; + if ((subfm._fields.empty() && !subfm._special) || + !(elt.type() == Object || elt.type() == Array)) { + // field map empty, or element is not an array/object + if (subfm._include) { + bob->append(elt); } - - ProjectionExec& subfm = *field->second; - if ((subfm._fields.empty() && !subfm._special) - || !(elt.type() == Object || elt.type() == Array)) { - // field map empty, or element is not an array/object - if (subfm._include) { - bob->append(elt); - } + } else if (elt.type() == Object) { + BSONObjBuilder subBob; + BSONObjIterator it(elt.embeddedObject()); + while (it.more()) { + subfm.append(&subBob, it.next(), details, arrayOpType); } - else if (elt.type() == Object) { - BSONObjBuilder subBob; - BSONObjIterator it(elt.embeddedObject()); - while (it.more()) { - subfm.append(&subBob, it.next(), details, arrayOpType); + bob->append(elt.fieldName(), subBob.obj()); + } else { + // Array + BSONObjBuilder matchedBuilder; + if (details && arrayOpType == ARRAY_OP_POSITIONAL) { + // $ positional operator specified + if (!details->hasElemMatchKey()) { + mongoutils::str::stream error; + error << "positional operator (" << elt.fieldName() + << ".$) requires corresponding field" + << " in query specifier"; + return Status(ErrorCodes::BadValue, error); } - bob->append(elt.fieldName(), subBob.obj()); - } - else { - // Array - BSONObjBuilder matchedBuilder; - if (details && arrayOpType == ARRAY_OP_POSITIONAL) { - // $ 
positional operator specified - if (!details->hasElemMatchKey()) { - mongoutils::str::stream error; - error << "positional operator (" << elt.fieldName() - << ".$) requires corresponding field" - << " in query specifier"; - return Status(ErrorCodes::BadValue, error); - } - - if (elt.embeddedObject()[details->elemMatchKey()].eoo()) { - return Status(ErrorCodes::BadValue, - "positional operator element mismatch"); - } - // append as the first and only element in the projected array - matchedBuilder.appendAs( elt.embeddedObject()[details->elemMatchKey()], "0" ); - } - else { - // append exact array; no subarray matcher specified - subfm.appendArray(&matchedBuilder, elt.embeddedObject()); + if (elt.embeddedObject()[details->elemMatchKey()].eoo()) { + return Status(ErrorCodes::BadValue, "positional operator element mismatch"); } - bob->appendArray(elt.fieldName(), matchedBuilder.obj()); - } - return Status::OK(); + // append as the first and only element in the projected array + matchedBuilder.appendAs(elt.embeddedObject()[details->elemMatchKey()], "0"); + } else { + // append exact array; no subarray matcher specified + subfm.appendArray(&matchedBuilder, elt.embeddedObject()); + } + bob->appendArray(elt.fieldName(), matchedBuilder.obj()); } + return Status::OK(); +} + } // namespace mongo diff --git a/src/mongo/db/exec/projection_exec.h b/src/mongo/db/exec/projection_exec.h index 6b8dd1456af..4a1382b4318 100644 --- a/src/mongo/db/exec/projection_exec.h +++ b/src/mongo/db/exec/projection_exec.h @@ -36,166 +36,162 @@ namespace mongo { - class ProjectionExec { - public: - /** - * A .find() projection can have an array operation, either an elemMatch or positional (or - * neither). - */ - enum ArrayOpType { - ARRAY_OP_NORMAL = 0, - ARRAY_OP_ELEM_MATCH, - ARRAY_OP_POSITIONAL - }; - - /** - * Projections based on data computed while answering a query, or other metadata about a - * document / query. 
- */ - enum MetaProjection { - META_TEXT_SCORE, - META_GEONEAR_DIST, - META_GEONEAR_POINT, - META_RECORDID, - META_IX_KEY, - }; - - /** - * TODO: document why we like StringMap so much here - */ - typedef StringMap<ProjectionExec*> FieldMap; - typedef StringMap<MatchExpression*> Matchers; - typedef StringMap<MetaProjection> MetaMap; - - ProjectionExec(const BSONObj& spec, - const MatchExpression* queryExpression, - const MatchExpressionParser::WhereCallback& whereCallback = - MatchExpressionParser::WhereCallback()); - ~ProjectionExec(); - - /** - * Apply this projection to the 'member'. Changes the type to OWNED_OBJ. - */ - Status transform(WorkingSetMember* member) const; - - /** - * Apply this projection to the object 'in'. - * - * Upon success, 'out' is set to the new object and Status::OK() is returned. - * Otherwise, returns an error Status and *out is not mutated. - */ - Status transform(const BSONObj& in, BSONObj* out) const; - - private: - // - // Initialization - // - - ProjectionExec(); - - /** - * Add 'field' as a field name that is included or excluded as part of the projection. - */ - void add(const std::string& field, bool include); - - /** - * Add 'field' as a field name that is sliced as part of the projection. - */ - void add(const std::string& field, int skip, int limit); - - // - // Execution - // - - /** - * Apply the projection that 'this' represents to the object 'in'. 'details' is the result - * of a match evaluation of the full query on the object 'in'. This is only required - * if the projection is positional. - * - * If the projection is successfully computed, returns Status::OK() and stuff the result in - * 'bob'. - * Otherwise, returns error. - */ - Status transform(const BSONObj& in, - BSONObjBuilder* bob, - const MatchDetails* details = NULL) const; - - /** - * See transform(...) above. 
- */ - bool transformRequiresDetails() const { - return ARRAY_OP_POSITIONAL == _arrayOpType; - } - - /** - * Is the full document required to compute this projection? - */ - bool requiresDocument() const { - return _include || _hasNonSimple || _hasDottedField; - } - - /** - * Appends the element 'e' to the builder 'bob', possibly descending into sub-fields of 'e' - * if needed. - */ - Status append(BSONObjBuilder* bob, - const BSONElement& elt, - const MatchDetails* details = NULL, - const ArrayOpType arrayOpType = ARRAY_OP_NORMAL) const; - - /** - * Like append, but for arrays. - * Deals with slice and calls appendArray to preserve the array-ness. - */ - void appendArray(BSONObjBuilder* bob, const BSONObj& array, bool nested = false) const; - - // True if default at this level is to include. - bool _include; - - // True if this level can't be skipped or included without recursing. - bool _special; - - // We must group projections with common prefixes together. - // TODO: benchmark std::vector<pair> vs map - // - // Projection is a rooted tree. If we have {a.b: 1, a.c: 1} we don't want to - // double-traverse the document when we're projecting it. Instead, we have an entry in - // _fields for 'a' with two sub projections: b:1 and c:1. - FieldMap _fields; - - // The raw projection spec. that is passed into init(...) - BSONObj _source; - - // Should we include the _id field? - bool _includeID; - - // Arguments from the $slice operator. - int _skip; - int _limit; - - // Used for $elemMatch and positional operator ($) - Matchers _matchers; - - // The matchers above point into BSONObjs and this is where those objs live. - std::vector<BSONObj> _elemMatchObjs; - - ArrayOpType _arrayOpType; - - // Is there an slice, elemMatch or meta operator? - bool _hasNonSimple; - - // Is there a projection over a dotted field or a $ positional operator? - bool _hasDottedField; - - // The full query expression. Used when we need MatchDetails. 
- const MatchExpression* _queryExpression; - - // Projections that aren't sourced from the document or index keys. - MetaMap _meta; - - // Do we have a returnKey projection? If so we *only* output the index key metadata. If - // it's not found we output nothing. - bool _hasReturnKey; +class ProjectionExec { +public: + /** + * A .find() projection can have an array operation, either an elemMatch or positional (or + * neither). + */ + enum ArrayOpType { ARRAY_OP_NORMAL = 0, ARRAY_OP_ELEM_MATCH, ARRAY_OP_POSITIONAL }; + + /** + * Projections based on data computed while answering a query, or other metadata about a + * document / query. + */ + enum MetaProjection { + META_TEXT_SCORE, + META_GEONEAR_DIST, + META_GEONEAR_POINT, + META_RECORDID, + META_IX_KEY, }; + /** + * TODO: document why we like StringMap so much here + */ + typedef StringMap<ProjectionExec*> FieldMap; + typedef StringMap<MatchExpression*> Matchers; + typedef StringMap<MetaProjection> MetaMap; + + ProjectionExec(const BSONObj& spec, + const MatchExpression* queryExpression, + const MatchExpressionParser::WhereCallback& whereCallback = + MatchExpressionParser::WhereCallback()); + ~ProjectionExec(); + + /** + * Apply this projection to the 'member'. Changes the type to OWNED_OBJ. + */ + Status transform(WorkingSetMember* member) const; + + /** + * Apply this projection to the object 'in'. + * + * Upon success, 'out' is set to the new object and Status::OK() is returned. + * Otherwise, returns an error Status and *out is not mutated. + */ + Status transform(const BSONObj& in, BSONObj* out) const; + +private: + // + // Initialization + // + + ProjectionExec(); + + /** + * Add 'field' as a field name that is included or excluded as part of the projection. + */ + void add(const std::string& field, bool include); + + /** + * Add 'field' as a field name that is sliced as part of the projection. 
+ */ + void add(const std::string& field, int skip, int limit); + + // + // Execution + // + + /** + * Apply the projection that 'this' represents to the object 'in'. 'details' is the result + * of a match evaluation of the full query on the object 'in'. This is only required + * if the projection is positional. + * + * If the projection is successfully computed, returns Status::OK() and stuff the result in + * 'bob'. + * Otherwise, returns error. + */ + Status transform(const BSONObj& in, + BSONObjBuilder* bob, + const MatchDetails* details = NULL) const; + + /** + * See transform(...) above. + */ + bool transformRequiresDetails() const { + return ARRAY_OP_POSITIONAL == _arrayOpType; + } + + /** + * Is the full document required to compute this projection? + */ + bool requiresDocument() const { + return _include || _hasNonSimple || _hasDottedField; + } + + /** + * Appends the element 'e' to the builder 'bob', possibly descending into sub-fields of 'e' + * if needed. + */ + Status append(BSONObjBuilder* bob, + const BSONElement& elt, + const MatchDetails* details = NULL, + const ArrayOpType arrayOpType = ARRAY_OP_NORMAL) const; + + /** + * Like append, but for arrays. + * Deals with slice and calls appendArray to preserve the array-ness. + */ + void appendArray(BSONObjBuilder* bob, const BSONObj& array, bool nested = false) const; + + // True if default at this level is to include. + bool _include; + + // True if this level can't be skipped or included without recursing. + bool _special; + + // We must group projections with common prefixes together. + // TODO: benchmark std::vector<pair> vs map + // + // Projection is a rooted tree. If we have {a.b: 1, a.c: 1} we don't want to + // double-traverse the document when we're projecting it. Instead, we have an entry in + // _fields for 'a' with two sub projections: b:1 and c:1. + FieldMap _fields; + + // The raw projection spec. that is passed into init(...) + BSONObj _source; + + // Should we include the _id field? 
+ bool _includeID; + + // Arguments from the $slice operator. + int _skip; + int _limit; + + // Used for $elemMatch and positional operator ($) + Matchers _matchers; + + // The matchers above point into BSONObjs and this is where those objs live. + std::vector<BSONObj> _elemMatchObjs; + + ArrayOpType _arrayOpType; + + // Is there an slice, elemMatch or meta operator? + bool _hasNonSimple; + + // Is there a projection over a dotted field or a $ positional operator? + bool _hasDottedField; + + // The full query expression. Used when we need MatchDetails. + const MatchExpression* _queryExpression; + + // Projections that aren't sourced from the document or index keys. + MetaMap _meta; + + // Do we have a returnKey projection? If so we *only* output the index key metadata. If + // it's not found we output nothing. + bool _hasReturnKey; +}; + } // namespace mongo diff --git a/src/mongo/db/exec/projection_exec_test.cpp b/src/mongo/db/exec/projection_exec_test.cpp index 56d9cbc5be2..fca1065cba0 100644 --- a/src/mongo/db/exec/projection_exec_test.cpp +++ b/src/mongo/db/exec/projection_exec_test.cpp @@ -42,178 +42,187 @@ using namespace mongo; namespace { - using std::unique_ptr; - - /** - * Utility function to create MatchExpression - */ - MatchExpression* parseMatchExpression(const BSONObj& obj) { - StatusWithMatchExpression status = MatchExpressionParser::parse(obj); - ASSERT_TRUE(status.isOK()); - MatchExpression* expr(status.getValue()); - return expr; - } - - // - // transform tests - // - - /** - * test function to verify results of transform() - * on a working set member. - * - * specStr - projection specification - * queryStr - query - * objStr - object to run projection on - * data - computed data. Owned by working set member created in this function if not null. - * expectedStatusOK - expected status of transformation - * expectedObjStr - expected object after successful projection. - * Ignored if expectedStatusOK is false. 
- */ - - void testTransform(const char* specStr, const char* queryStr, const char* objStr, - WorkingSetComputedData* data, - bool expectedStatusOK, const char* expectedObjStr) { - // Create projection exec object. - BSONObj spec = fromjson(specStr); - BSONObj query = fromjson(queryStr); - unique_ptr<MatchExpression> queryExpression(parseMatchExpression(query)); - ProjectionExec exec(spec, queryExpression.get()); - - // Create working set member. - WorkingSetMember wsm; - wsm.state = WorkingSetMember::OWNED_OBJ; - wsm.obj = Snapshotted<BSONObj>(SnapshotId(), fromjson(objStr)); - if (data) { - wsm.addComputed(data); - } - - // Transform object - Status status = exec.transform(&wsm); - - // There are fewer checks to perform if we are expected a failed status. - if (!expectedStatusOK) { - if (status.isOK()) { - mongoutils::str::stream ss; - ss << "expected transform() to fail but got success instead." - << "\nprojection spec: " << specStr - << "\nquery: " << queryStr - << "\nobject before projection: " << objStr; - FAIL(ss); - } - return; - } - - // If we are expecting a successful transformation but got a failed status instead, - // print out status message in assertion message. - if (!status.isOK()) { - mongoutils::str::stream ss; - ss << "transform() test failed: unexpected failed status: " << status.toString() - << "\nprojection spec: " << specStr - << "\nquery: " << queryStr - << "\nobject before projection: " << objStr - << "\nexpected object after projection: " << expectedObjStr; - FAIL(ss); - } - - // Finally, we compare the projected object. - const BSONObj& obj = wsm.obj.value(); - BSONObj expectedObj = fromjson(expectedObjStr); - if (obj != expectedObj) { - mongoutils::str::stream ss; - ss << "transform() test failed: unexpected projected object." 
- << "\nprojection spec: " << specStr - << "\nquery: " << queryStr - << "\nobject before projection: " << objStr - << "\nexpected object after projection: " << expectedObjStr - << "\nactual object after projection: " << obj.toString(); - FAIL(ss); - } - } +using std::unique_ptr; - /** - * testTransform without computed data argument. - */ - void testTransform(const char* specStr, const char* queryStr, const char* objStr, - bool expectedStatusOK, const char* expectedObjStr) { - testTransform(specStr, queryStr, objStr, NULL, expectedStatusOK, expectedObjStr); - } +/** + * Utility function to create MatchExpression + */ +MatchExpression* parseMatchExpression(const BSONObj& obj) { + StatusWithMatchExpression status = MatchExpressionParser::parse(obj); + ASSERT_TRUE(status.isOK()); + MatchExpression* expr(status.getValue()); + return expr; +} - // - // position $ - // +// +// transform tests +// - TEST(ProjectionExecTest, TransformPositionalDollar) { - // Valid position $ projections. - testTransform("{'a.$': 1}", "{a: 10}", "{a: [10, 20, 30]}", true, "{a: [10]}"); - testTransform("{'a.$': 1}", "{a: 20}", "{a: [10, 20, 30]}", true, "{a: [20]}"); - testTransform("{'a.$': 1}", "{a: 30}", "{a: [10, 20, 30]}", true, "{a: [30]}"); - testTransform("{'a.$': 1}", "{a: {$gt: 4}}", "{a: [5]}", true, "{a: [5]}"); +/** + * test function to verify results of transform() + * on a working set member. + * + * specStr - projection specification + * queryStr - query + * objStr - object to run projection on + * data - computed data. Owned by working set member created in this function if not null. + * expectedStatusOK - expected status of transformation + * expectedObjStr - expected object after successful projection. + * Ignored if expectedStatusOK is false. + */ - // Invalid position $ projections. 
- testTransform("{'a.$': 1}", "{a: {$size: 1}}", "{a: [5]}", false, ""); +void testTransform(const char* specStr, + const char* queryStr, + const char* objStr, + WorkingSetComputedData* data, + bool expectedStatusOK, + const char* expectedObjStr) { + // Create projection exec object. + BSONObj spec = fromjson(specStr); + BSONObj query = fromjson(queryStr); + unique_ptr<MatchExpression> queryExpression(parseMatchExpression(query)); + ProjectionExec exec(spec, queryExpression.get()); + + // Create working set member. + WorkingSetMember wsm; + wsm.state = WorkingSetMember::OWNED_OBJ; + wsm.obj = Snapshotted<BSONObj>(SnapshotId(), fromjson(objStr)); + if (data) { + wsm.addComputed(data); } - // - // $elemMatch - // - - TEST(ProjectionExecTest, TransformElemMatch) { - const char* s = "{a: [{x: 1, y: 10}, {x: 1, y: 20}, {x: 2, y: 10}]}"; + // Transform object + Status status = exec.transform(&wsm); - // Valid $elemMatch projections. - testTransform("{a: {$elemMatch: {x: 1}}}", "{}", s, true, "{a: [{x: 1, y: 10}]}"); - testTransform("{a: {$elemMatch: {x: 1, y: 20}}}", "{}", s, true, "{a: [{x: 1, y: 20}]}"); - testTransform("{a: {$elemMatch: {x: 2}}}", "{}", s, true, "{a: [{x: 2, y: 10}]}"); - testTransform("{a: {$elemMatch: {x: 3}}}", "{}", s, true, "{}"); - - // $elemMatch on unknown field z - testTransform("{a: {$elemMatch: {z: 1}}}", "{}", s, true, "{}"); + // There are fewer checks to perform if we are expected a failed status. + if (!expectedStatusOK) { + if (status.isOK()) { + mongoutils::str::stream ss; + ss << "expected transform() to fail but got success instead." + << "\nprojection spec: " << specStr << "\nquery: " << queryStr + << "\nobject before projection: " << objStr; + FAIL(ss); + } + return; } - // - // $slice - // - - TEST(ProjectionExecTest, TransformSliceCount) { - // Valid $slice projections using format {$slice: count}. 
- testTransform("{a: {$slice: -10}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); - testTransform("{a: {$slice: -3}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); - testTransform("{a: {$slice: -1}}", "{}", "{a: [4, 6, 8]}", true, "{a: [8]}"); - testTransform("{a: {$slice: 0}}", "{}", "{a: [4, 6, 8]}", true, "{a: []}"); - testTransform("{a: {$slice: 1}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4]}"); - testTransform("{a: {$slice: 3}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); - testTransform("{a: {$slice: 10}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); + // If we are expecting a successful transformation but got a failed status instead, + // print out status message in assertion message. + if (!status.isOK()) { + mongoutils::str::stream ss; + ss << "transform() test failed: unexpected failed status: " << status.toString() + << "\nprojection spec: " << specStr << "\nquery: " << queryStr + << "\nobject before projection: " << objStr + << "\nexpected object after projection: " << expectedObjStr; + FAIL(ss); } - TEST(ProjectionExecTest, TransformSliceSkipLimit) { - // Valid $slice projections using format {$slice: [skip, limit]}. - // Non-positive limits are rejected at the query parser and therefore not handled by - // the projection execution stage. In fact, it will abort on an invalid limit. 
- testTransform("{a: {$slice: [-10, 10]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); - testTransform("{a: {$slice: [-3, 5]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); - testTransform("{a: {$slice: [-1, 1]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [8]}"); - testTransform("{a: {$slice: [0, 2]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6]}"); - testTransform("{a: {$slice: [0, 1]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4]}"); - testTransform("{a: {$slice: [1, 1]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [6]}"); - testTransform("{a: {$slice: [3, 5]}}", "{}", "{a: [4, 6, 8]}", true, "{a: []}"); - testTransform("{a: {$slice: [10, 10]}}", "{}", "{a: [4, 6, 8]}", true, "{a: []}"); + // Finally, we compare the projected object. + const BSONObj& obj = wsm.obj.value(); + BSONObj expectedObj = fromjson(expectedObjStr); + if (obj != expectedObj) { + mongoutils::str::stream ss; + ss << "transform() test failed: unexpected projected object." + << "\nprojection spec: " << specStr << "\nquery: " << queryStr + << "\nobject before projection: " << objStr + << "\nexpected object after projection: " << expectedObjStr + << "\nactual object after projection: " << obj.toString(); + FAIL(ss); } +} - // - // $meta - // $meta projections add computed values to the projected object. - // - - TEST(ProjectionExecTest, TransformMetaTextScore) { - // Query {} is ignored. - testTransform("{b: {$meta: 'textScore'}}", "{}", "{a: 'hello'}", - new mongo::TextScoreComputedData(100), - true, "{a: 'hello', b: 100}"); - // Projected meta field should overwrite existing field. - testTransform("{b: {$meta: 'textScore'}}", "{}", "{a: 'hello', b: -1}", - new mongo::TextScoreComputedData(100), - true, "{a: 'hello', b: 100}"); - } +/** + * testTransform without computed data argument. 
+ */ +void testTransform(const char* specStr, + const char* queryStr, + const char* objStr, + bool expectedStatusOK, + const char* expectedObjStr) { + testTransform(specStr, queryStr, objStr, NULL, expectedStatusOK, expectedObjStr); +} + +// +// position $ +// + +TEST(ProjectionExecTest, TransformPositionalDollar) { + // Valid position $ projections. + testTransform("{'a.$': 1}", "{a: 10}", "{a: [10, 20, 30]}", true, "{a: [10]}"); + testTransform("{'a.$': 1}", "{a: 20}", "{a: [10, 20, 30]}", true, "{a: [20]}"); + testTransform("{'a.$': 1}", "{a: 30}", "{a: [10, 20, 30]}", true, "{a: [30]}"); + testTransform("{'a.$': 1}", "{a: {$gt: 4}}", "{a: [5]}", true, "{a: [5]}"); + + // Invalid position $ projections. + testTransform("{'a.$': 1}", "{a: {$size: 1}}", "{a: [5]}", false, ""); +} + +// +// $elemMatch +// + +TEST(ProjectionExecTest, TransformElemMatch) { + const char* s = "{a: [{x: 1, y: 10}, {x: 1, y: 20}, {x: 2, y: 10}]}"; + + // Valid $elemMatch projections. + testTransform("{a: {$elemMatch: {x: 1}}}", "{}", s, true, "{a: [{x: 1, y: 10}]}"); + testTransform("{a: {$elemMatch: {x: 1, y: 20}}}", "{}", s, true, "{a: [{x: 1, y: 20}]}"); + testTransform("{a: {$elemMatch: {x: 2}}}", "{}", s, true, "{a: [{x: 2, y: 10}]}"); + testTransform("{a: {$elemMatch: {x: 3}}}", "{}", s, true, "{}"); + + // $elemMatch on unknown field z + testTransform("{a: {$elemMatch: {z: 1}}}", "{}", s, true, "{}"); +} + +// +// $slice +// + +TEST(ProjectionExecTest, TransformSliceCount) { + // Valid $slice projections using format {$slice: count}. 
+ testTransform("{a: {$slice: -10}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); + testTransform("{a: {$slice: -3}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); + testTransform("{a: {$slice: -1}}", "{}", "{a: [4, 6, 8]}", true, "{a: [8]}"); + testTransform("{a: {$slice: 0}}", "{}", "{a: [4, 6, 8]}", true, "{a: []}"); + testTransform("{a: {$slice: 1}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4]}"); + testTransform("{a: {$slice: 3}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); + testTransform("{a: {$slice: 10}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); +} + +TEST(ProjectionExecTest, TransformSliceSkipLimit) { + // Valid $slice projections using format {$slice: [skip, limit]}. + // Non-positive limits are rejected at the query parser and therefore not handled by + // the projection execution stage. In fact, it will abort on an invalid limit. + testTransform("{a: {$slice: [-10, 10]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); + testTransform("{a: {$slice: [-3, 5]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6, 8]}"); + testTransform("{a: {$slice: [-1, 1]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [8]}"); + testTransform("{a: {$slice: [0, 2]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4, 6]}"); + testTransform("{a: {$slice: [0, 1]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [4]}"); + testTransform("{a: {$slice: [1, 1]}}", "{}", "{a: [4, 6, 8]}", true, "{a: [6]}"); + testTransform("{a: {$slice: [3, 5]}}", "{}", "{a: [4, 6, 8]}", true, "{a: []}"); + testTransform("{a: {$slice: [10, 10]}}", "{}", "{a: [4, 6, 8]}", true, "{a: []}"); +} + +// +// $meta +// $meta projections add computed values to the projected object. +// + +TEST(ProjectionExecTest, TransformMetaTextScore) { + // Query {} is ignored. + testTransform("{b: {$meta: 'textScore'}}", + "{}", + "{a: 'hello'}", + new mongo::TextScoreComputedData(100), + true, + "{a: 'hello', b: 100}"); + // Projected meta field should overwrite existing field. 
+ testTransform("{b: {$meta: 'textScore'}}", + "{}", + "{a: 'hello', b: -1}", + new mongo::TextScoreComputedData(100), + true, + "{a: 'hello', b: 100}"); +} } // namespace diff --git a/src/mongo/db/exec/queued_data_stage.cpp b/src/mongo/db/exec/queued_data_stage.cpp index 740f3084740..1fffe7aba86 100644 --- a/src/mongo/db/exec/queued_data_stage.cpp +++ b/src/mongo/db/exec/queued_data_stage.cpp @@ -33,85 +33,87 @@ namespace mongo { - using std::unique_ptr; - using std::vector; +using std::unique_ptr; +using std::vector; - const char* QueuedDataStage::kStageType = "QUEUED_DATA"; +const char* QueuedDataStage::kStageType = "QUEUED_DATA"; - QueuedDataStage::QueuedDataStage(WorkingSet* ws) - : _ws(ws), - _commonStats(kStageType) - {} +QueuedDataStage::QueuedDataStage(WorkingSet* ws) : _ws(ws), _commonStats(kStageType) {} - PlanStage::StageState QueuedDataStage::work(WorkingSetID* out) { - ++_commonStats.works; +PlanStage::StageState QueuedDataStage::work(WorkingSetID* out) { + ++_commonStats.works; - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); + // Adds the amount of time taken by work() to executionTimeMillis. 
+ ScopedTimer timer(&_commonStats.executionTimeMillis); - if (isEOF()) { return PlanStage::IS_EOF; } - - StageState state = _results.front(); - _results.pop(); + if (isEOF()) { + return PlanStage::IS_EOF; + } - if (PlanStage::ADVANCED == state) { - ++_commonStats.advanced; - *out = _members.front(); - _members.pop(); - } - else if (PlanStage::NEED_TIME == state) { - ++_commonStats.needTime; - } + StageState state = _results.front(); + _results.pop(); - return state; + if (PlanStage::ADVANCED == state) { + ++_commonStats.advanced; + *out = _members.front(); + _members.pop(); + } else if (PlanStage::NEED_TIME == state) { + ++_commonStats.needTime; } - bool QueuedDataStage::isEOF() { return _results.empty(); } + return state; +} - void QueuedDataStage::saveState() { - ++_commonStats.yields; - } +bool QueuedDataStage::isEOF() { + return _results.empty(); +} - void QueuedDataStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; - } +void QueuedDataStage::saveState() { + ++_commonStats.yields; +} - void QueuedDataStage::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - ++_commonStats.invalidates; - } +void QueuedDataStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; +} - PlanStageStats* QueuedDataStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_QUEUED_DATA)); - ret->specific.reset(new MockStats(_specificStats)); - return ret.release(); - } +void QueuedDataStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; +} - const CommonStats* QueuedDataStage::getCommonStats() const { return &_commonStats; } +PlanStageStats* QueuedDataStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_QUEUED_DATA)); + ret->specific.reset(new MockStats(_specificStats)); + return ret.release(); +} - const 
SpecificStats* QueuedDataStage::getSpecificStats() const { return &_specificStats; } +const CommonStats* QueuedDataStage::getCommonStats() const { + return &_commonStats; +} - void QueuedDataStage::pushBack(const PlanStage::StageState state) { - invariant(PlanStage::ADVANCED != state); - _results.push(state); - } +const SpecificStats* QueuedDataStage::getSpecificStats() const { + return &_specificStats; +} - void QueuedDataStage::pushBack(const WorkingSetMember& member) { - _results.push(PlanStage::ADVANCED); +void QueuedDataStage::pushBack(const PlanStage::StageState state) { + invariant(PlanStage::ADVANCED != state); + _results.push(state); +} - WorkingSetID id = _ws->allocate(); - WorkingSetMember* ourMember = _ws->get(id); - WorkingSetCommon::initFrom(ourMember, member); +void QueuedDataStage::pushBack(const WorkingSetMember& member) { + _results.push(PlanStage::ADVANCED); - // member lives in _ws. We'll return it when _results hits ADVANCED. - _members.push(id); - } + WorkingSetID id = _ws->allocate(); + WorkingSetMember* ourMember = _ws->get(id); + WorkingSetCommon::initFrom(ourMember, member); - vector<PlanStage*> QueuedDataStage::getChildren() const { - vector<PlanStage*> empty; - return empty; - } + // member lives in _ws. We'll return it when _results hits ADVANCED. + _members.push(id); +} + +vector<PlanStage*> QueuedDataStage::getChildren() const { + vector<PlanStage*> empty; + return empty; +} } // namespace mongo diff --git a/src/mongo/db/exec/queued_data_stage.h b/src/mongo/db/exec/queued_data_stage.h index 5d7a7b2b159..89185f6d751 100644 --- a/src/mongo/db/exec/queued_data_stage.h +++ b/src/mongo/db/exec/queued_data_stage.h @@ -35,78 +35,80 @@ namespace mongo { - class RecordId; +class RecordId; - /** - * QueuedDataStage is a data-producing stage. Unlike the other two leaf stages (CollectionScan - * and IndexScan) QueuedDataStage does not require any underlying storage layer. 
- * - * A QueuedDataStage is "programmed" by pushing return values from work() onto its internal - * queue. Calls to QueuedDataStage::work() pop values off that queue and return them in FIFO - * order, annotating the working set with data when appropriate. - */ - class QueuedDataStage : public PlanStage { - public: - QueuedDataStage(WorkingSet* ws); - virtual ~QueuedDataStage() { } +/** + * QueuedDataStage is a data-producing stage. Unlike the other two leaf stages (CollectionScan + * and IndexScan) QueuedDataStage does not require any underlying storage layer. + * + * A QueuedDataStage is "programmed" by pushing return values from work() onto its internal + * queue. Calls to QueuedDataStage::work() pop values off that queue and return them in FIFO + * order, annotating the working set with data when appropriate. + */ +class QueuedDataStage : public PlanStage { +public: + QueuedDataStage(WorkingSet* ws); + virtual ~QueuedDataStage() {} - virtual StageState work(WorkingSetID* out); + virtual StageState work(WorkingSetID* out); - virtual bool isEOF(); + virtual bool isEOF(); - // These don't really mean anything here. - // Some day we could count the # of calls to the yield functions to check that other stages - // have correct yielding behavior. - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + // These don't really mean anything here. + // Some day we could count the # of calls to the yield functions to check that other stages + // have correct yielding behavior. 
+ virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_QUEUED_DATA; } + virtual StageType stageType() const { + return STAGE_QUEUED_DATA; + } - // - // Exec stats - // + // + // Exec stats + // - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - /** - * Add a result to the back of the queue. - * - * Note: do not add PlanStage::ADVANCED with this method, ADVANCED can - * only be added with a data member. - * - * Work() goes through the queue. - * Either no data is returned (just a state), or... - */ - void pushBack(const PlanStage::StageState state); + /** + * Add a result to the back of the queue. + * + * Note: do not add PlanStage::ADVANCED with this method, ADVANCED can + * only be added with a data member. + * + * Work() goes through the queue. + * Either no data is returned (just a state), or... + */ + void pushBack(const PlanStage::StageState state); - /** - * ...data is returned (and we ADVANCED) - * - * Allocates a new member and copies 'member' into it. - * Does not take ownership of anything in 'member'. - */ - void pushBack(const WorkingSetMember& member); + /** + * ...data is returned (and we ADVANCED) + * + * Allocates a new member and copies 'member' into it. + * Does not take ownership of anything in 'member'. + */ + void pushBack(const WorkingSetMember& member); - static const char* kStageType; + static const char* kStageType; - private: - // We don't own this. - WorkingSet* _ws; +private: + // We don't own this. 
+ WorkingSet* _ws; - // The data we return. - std::queue<PlanStage::StageState> _results; - std::queue<WorkingSetID> _members; + // The data we return. + std::queue<PlanStage::StageState> _results; + std::queue<WorkingSetID> _members; - // Stats - CommonStats _commonStats; - MockStats _specificStats; - }; + // Stats + CommonStats _commonStats; + MockStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/queued_data_stage_test.cpp b/src/mongo/db/exec/queued_data_stage_test.cpp index ef823c04cfb..45e4a6b2a96 100644 --- a/src/mongo/db/exec/queued_data_stage_test.cpp +++ b/src/mongo/db/exec/queued_data_stage_test.cpp @@ -38,70 +38,70 @@ using namespace mongo; namespace { - using std::unique_ptr; +using std::unique_ptr; - // - // Basic test that we get out valid stats objects. - // - TEST(QueuedDataStageTest, getValidStats) { - WorkingSet ws; - unique_ptr<QueuedDataStage> mock(new QueuedDataStage(&ws)); - const CommonStats* commonStats = mock->getCommonStats(); - ASSERT_EQUALS(commonStats->works, static_cast<size_t>(0)); - const SpecificStats* specificStats = mock->getSpecificStats(); - ASSERT(specificStats); - unique_ptr<PlanStageStats> allStats(mock->getStats()); - ASSERT_EQUALS(allStats->stageType, mock->stageType()); - } +// +// Basic test that we get out valid stats objects. +// +TEST(QueuedDataStageTest, getValidStats) { + WorkingSet ws; + unique_ptr<QueuedDataStage> mock(new QueuedDataStage(&ws)); + const CommonStats* commonStats = mock->getCommonStats(); + ASSERT_EQUALS(commonStats->works, static_cast<size_t>(0)); + const SpecificStats* specificStats = mock->getSpecificStats(); + ASSERT(specificStats); + unique_ptr<PlanStageStats> allStats(mock->getStats()); + ASSERT_EQUALS(allStats->stageType, mock->stageType()); +} - // - // Test that our stats are updated as we perform operations. 
- // - TEST(QueuedDataStageTest, validateStats) { - WorkingSet ws; - WorkingSetID wsID; - unique_ptr<QueuedDataStage> mock(new QueuedDataStage(&ws)); +// +// Test that our stats are updated as we perform operations. +// +TEST(QueuedDataStageTest, validateStats) { + WorkingSet ws; + WorkingSetID wsID; + unique_ptr<QueuedDataStage> mock(new QueuedDataStage(&ws)); - // make sure that we're at all zero - const CommonStats* stats = mock->getCommonStats(); - ASSERT_EQUALS(stats->yields, 0U); - ASSERT_EQUALS(stats->unyields, 0U); - ASSERT_EQUALS(stats->invalidates, 0U); - ASSERT_EQUALS(stats->works, 0U); - ASSERT_EQUALS(stats->needTime, 0U); - ASSERT_EQUALS(stats->advanced, 0U); - ASSERT_FALSE(stats->isEOF); + // make sure that we're at all zero + const CommonStats* stats = mock->getCommonStats(); + ASSERT_EQUALS(stats->yields, 0U); + ASSERT_EQUALS(stats->unyields, 0U); + ASSERT_EQUALS(stats->invalidates, 0U); + ASSERT_EQUALS(stats->works, 0U); + ASSERT_EQUALS(stats->needTime, 0U); + ASSERT_EQUALS(stats->advanced, 0U); + ASSERT_FALSE(stats->isEOF); - // 'perform' some operations, validate stats - // needTime - mock->pushBack(PlanStage::NEED_TIME); - mock->work(&wsID); - ASSERT_EQUALS(stats->works, 1U); - ASSERT_EQUALS(stats->needTime, 1U); + // 'perform' some operations, validate stats + // needTime + mock->pushBack(PlanStage::NEED_TIME); + mock->work(&wsID); + ASSERT_EQUALS(stats->works, 1U); + ASSERT_EQUALS(stats->needTime, 1U); - // advanced, with pushed data - const WorkingSetMember member; - mock->pushBack(member); - mock->work(&wsID); - ASSERT_EQUALS(stats->works, 2U); - ASSERT_EQUALS(stats->advanced, 1U); + // advanced, with pushed data + const WorkingSetMember member; + mock->pushBack(member); + mock->work(&wsID); + ASSERT_EQUALS(stats->works, 2U); + ASSERT_EQUALS(stats->advanced, 1U); - // yields - mock->saveState(); - ASSERT_EQUALS(stats->yields, 1U); + // yields + mock->saveState(); + ASSERT_EQUALS(stats->yields, 1U); - // unyields - mock->restoreState(NULL); - 
ASSERT_EQUALS(stats->unyields, 1U); + // unyields + mock->restoreState(NULL); + ASSERT_EQUALS(stats->unyields, 1U); - // invalidates - const RecordId dl(0, 0); - mock->invalidate(NULL, dl, INVALIDATION_MUTATION); - ASSERT_EQUALS(stats->invalidates, 1U); + // invalidates + const RecordId dl(0, 0); + mock->invalidate(NULL, dl, INVALIDATION_MUTATION); + ASSERT_EQUALS(stats->invalidates, 1U); - // and now we are d1U, but must trigger EOF with getStats() - ASSERT_FALSE(stats->isEOF); - unique_ptr<PlanStageStats> allStats(mock->getStats()); - ASSERT_TRUE(stats->isEOF); - } + // and now we are d1U, but must trigger EOF with getStats() + ASSERT_FALSE(stats->isEOF); + unique_ptr<PlanStageStats> allStats(mock->getStats()); + ASSERT_TRUE(stats->isEOF); +} } diff --git a/src/mongo/db/exec/scoped_timer.cpp b/src/mongo/db/exec/scoped_timer.cpp index 9cf6f60ebad..e1db4a44ff8 100644 --- a/src/mongo/db/exec/scoped_timer.cpp +++ b/src/mongo/db/exec/scoped_timer.cpp @@ -34,14 +34,12 @@ namespace mongo { - ScopedTimer::ScopedTimer(long long* counter) : - _counter(counter), - _start(Listener::getElapsedTimeMillis()) { - } +ScopedTimer::ScopedTimer(long long* counter) + : _counter(counter), _start(Listener::getElapsedTimeMillis()) {} - ScopedTimer::~ScopedTimer() { - long long elapsed = Listener::getElapsedTimeMillis() - _start; - *_counter += elapsed; - } +ScopedTimer::~ScopedTimer() { + long long elapsed = Listener::getElapsedTimeMillis() - _start; + *_counter += elapsed; +} } // namespace mongo diff --git a/src/mongo/db/exec/scoped_timer.h b/src/mongo/db/exec/scoped_timer.h index fa4c0d1f5c8..3e1c29fe719 100644 --- a/src/mongo/db/exec/scoped_timer.h +++ b/src/mongo/db/exec/scoped_timer.h @@ -32,26 +32,27 @@ namespace mongo { - /** - * This class increments a counter by a rough estimate of the time elapsed since its - * construction when it goes out of scope. 
- */ - class ScopedTimer { - MONGO_DISALLOW_COPYING(ScopedTimer); - public: - ScopedTimer(long long* counter); - - ~ScopedTimer(); - - private: - // Default constructor disallowed. - ScopedTimer(); - - // Reference to the counter that we are incrementing with the elapsed time. - long long* _counter; - - // Time at which the timer was constructed. - long long _start; - }; +/** + * This class increments a counter by a rough estimate of the time elapsed since its + * construction when it goes out of scope. + */ +class ScopedTimer { + MONGO_DISALLOW_COPYING(ScopedTimer); + +public: + ScopedTimer(long long* counter); + + ~ScopedTimer(); + +private: + // Default constructor disallowed. + ScopedTimer(); + + // Reference to the counter that we are incrementing with the elapsed time. + long long* _counter; + + // Time at which the timer was constructed. + long long _start; +}; } // namespace mongo diff --git a/src/mongo/db/exec/shard_filter.cpp b/src/mongo/db/exec/shard_filter.cpp index 2727b1c99f8..8f2f3005df1 100644 --- a/src/mongo/db/exec/shard_filter.cpp +++ b/src/mongo/db/exec/shard_filter.cpp @@ -38,130 +38,128 @@ namespace mongo { - using std::unique_ptr; - using std::vector; +using std::unique_ptr; +using std::vector; - // static - const char* ShardFilterStage::kStageType = "SHARDING_FILTER"; +// static +const char* ShardFilterStage::kStageType = "SHARDING_FILTER"; - ShardFilterStage::ShardFilterStage(const CollectionMetadataPtr& metadata, - WorkingSet* ws, - PlanStage* child) - : _ws(ws), _child(child), _commonStats(kStageType), _metadata(metadata) { } +ShardFilterStage::ShardFilterStage(const CollectionMetadataPtr& metadata, + WorkingSet* ws, + PlanStage* child) + : _ws(ws), _child(child), _commonStats(kStageType), _metadata(metadata) {} - ShardFilterStage::~ShardFilterStage() { } +ShardFilterStage::~ShardFilterStage() {} - bool ShardFilterStage::isEOF() { return _child->isEOF(); } +bool ShardFilterStage::isEOF() { + return _child->isEOF(); +} - 
PlanStage::StageState ShardFilterStage::work(WorkingSetID* out) { - ++_commonStats.works; +PlanStage::StageState ShardFilterStage::work(WorkingSetID* out) { + ++_commonStats.works; - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - // If we've returned as many results as we're limited to, isEOF will be true. - if (isEOF()) { return PlanStage::IS_EOF; } - - StageState status = _child->work(out); - - if (PlanStage::ADVANCED == status) { - - // If we're sharded make sure that we don't return data that is not owned by us, - // including pending documents from in-progress migrations and orphaned documents from - // aborted migrations - if (_metadata) { - - ShardKeyPattern shardKeyPattern(_metadata->getKeyPattern()); - WorkingSetMember* member = _ws->get(*out); - WorkingSetMatchableDocument matchable(member); - BSONObj shardKey = shardKeyPattern.extractShardKeyFromMatchable(matchable); - - if (shardKey.isEmpty()) { - - // We can't find a shard key for this document - this should never happen with - // a non-fetched result unless our query planning is screwed up - if (!member->hasObj()) { - - Status status(ErrorCodes::InternalError, - "shard key not found after a covered stage, " - "query planning has failed"); - - // Fail loudly and cleanly in production, fatally in debug - error() << status.toString(); - dassert(false); - - _ws->free(*out); - *out = WorkingSetCommon::allocateStatusMember(_ws, status); - return PlanStage::FAILURE; - } + // If we've returned as many results as we're limited to, isEOF will be true. 
+ if (isEOF()) { + return PlanStage::IS_EOF; + } - // Skip this document with a warning - no shard key should not be possible - // unless manually inserting data into a shard - warning() << "no shard key found in document " - << member->obj.value().toString() << " " - << "for shard key pattern " << _metadata->getKeyPattern() << ", " - << "document may have been inserted manually into shard"; - } + StageState status = _child->work(out); + + if (PlanStage::ADVANCED == status) { + // If we're sharded make sure that we don't return data that is not owned by us, + // including pending documents from in-progress migrations and orphaned documents from + // aborted migrations + if (_metadata) { + ShardKeyPattern shardKeyPattern(_metadata->getKeyPattern()); + WorkingSetMember* member = _ws->get(*out); + WorkingSetMatchableDocument matchable(member); + BSONObj shardKey = shardKeyPattern.extractShardKeyFromMatchable(matchable); + + if (shardKey.isEmpty()) { + // We can't find a shard key for this document - this should never happen with + // a non-fetched result unless our query planning is screwed up + if (!member->hasObj()) { + Status status(ErrorCodes::InternalError, + "shard key not found after a covered stage, " + "query planning has failed"); + + // Fail loudly and cleanly in production, fatally in debug + error() << status.toString(); + dassert(false); - if (!_metadata->keyBelongsToMe(shardKey)) { _ws->free(*out); - ++_specificStats.chunkSkips; - return PlanStage::NEED_TIME; + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + return PlanStage::FAILURE; } + + // Skip this document with a warning - no shard key should not be possible + // unless manually inserting data into a shard + warning() << "no shard key found in document " << member->obj.value().toString() + << " " + << "for shard key pattern " << _metadata->getKeyPattern() << ", " + << "document may have been inserted manually into shard"; } - // If we're here either we have shard state and our doc 
passed, or we have no shard - // state. Either way, we advance. - ++_commonStats.advanced; - return status; - } - else if (PlanStage::NEED_TIME == status) { - ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == status) { - ++_commonStats.needYield; + if (!_metadata->keyBelongsToMe(shardKey)) { + _ws->free(*out); + ++_specificStats.chunkSkips; + return PlanStage::NEED_TIME; + } } + // If we're here either we have shard state and our doc passed, or we have no shard + // state. Either way, we advance. + ++_commonStats.advanced; return status; + } else if (PlanStage::NEED_TIME == status) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == status) { + ++_commonStats.needYield; } - void ShardFilterStage::saveState() { - ++_commonStats.yields; - _child->saveState(); - } - - void ShardFilterStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; - _child->restoreState(opCtx); - } - - void ShardFilterStage::invalidate(OperationContext* txn, - const RecordId& dl, - InvalidationType type) { - ++_commonStats.invalidates; - _child->invalidate(txn, dl, type); - } - - vector<PlanStage*> ShardFilterStage::getChildren() const { - vector<PlanStage*> children; - children.push_back(_child.get()); - return children; - } - - PlanStageStats* ShardFilterStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_SHARDING_FILTER)); - ret->children.push_back(_child->getStats()); - ret->specific.reset(new ShardingFilterStats(_specificStats)); - return ret.release(); - } - - const CommonStats* ShardFilterStage::getCommonStats() const { - return &_commonStats; - } - - const SpecificStats* ShardFilterStage::getSpecificStats() const { - return &_specificStats; - } + return status; +} + +void ShardFilterStage::saveState() { + ++_commonStats.yields; + _child->saveState(); +} + +void ShardFilterStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; + 
_child->restoreState(opCtx); +} + +void ShardFilterStage::invalidate(OperationContext* txn, + const RecordId& dl, + InvalidationType type) { + ++_commonStats.invalidates; + _child->invalidate(txn, dl, type); +} + +vector<PlanStage*> ShardFilterStage::getChildren() const { + vector<PlanStage*> children; + children.push_back(_child.get()); + return children; +} + +PlanStageStats* ShardFilterStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_SHARDING_FILTER)); + ret->children.push_back(_child->getStats()); + ret->specific.reset(new ShardingFilterStats(_specificStats)); + return ret.release(); +} + +const CommonStats* ShardFilterStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* ShardFilterStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/shard_filter.h b/src/mongo/db/exec/shard_filter.h index 0703522b525..07b5d000bbb 100644 --- a/src/mongo/db/exec/shard_filter.h +++ b/src/mongo/db/exec/shard_filter.h @@ -36,76 +36,78 @@ namespace mongo { - /** - * This stage drops documents that didn't belong to the shard we're executing on at the time of - * construction. This matches the contract for sharded cursorids which guarantees that a - * StaleConfigException will be thrown early or the cursorid for its entire lifetime will return - * documents matching the shard version set on the connection at the time of cursorid creation. - * - * A related system will ensure that the data migrated away from a shard will not be deleted as - * long as there are active queries from before the migration. Currently, "active queries" is - * defined by cursorids so it is important that the metadata used in this stage uses the same - * version as the cursorid. Therefore, you must wrap any Runner using this Stage in a - * ClientCursor during the same lock grab as constructing the Runner. 
- * - * BEGIN NOTE FROM GREG - * - * There are three sharded query contracts: - * - * 0) Migration commit takes the db lock - i.e. is serialized with writes and reads. - * 1) No data should be returned from a query in ranges of migrations that committed after the - * query started, or from ranges not owned when the query began. - * 2) No migrated data should be removed from a shard while there are queries that were active - * before the migration. - * - * As implementation details, collection metadata is used to determine the ranges of all data - * not actively migrated (or orphaned). CursorIds are currently used to establish "active" - * queries before migration commit. - * - * Combining all this: if a query is started in a db lock and acquires in that (same) lock the - * collection metadata and a cursorId, the query will return results for exactly the ranges in - * the metadata (though of arbitrary staleness). This is the sharded collection query contract. - * - * END NOTE FROM GREG - * - * Preconditions: Child must be fetched. TODO: when covering analysis is in just build doc - * and check that against shard key. See SERVER-5022. 
- */ - class ShardFilterStage : public PlanStage { - public: - ShardFilterStage(const CollectionMetadataPtr& metadata, WorkingSet* ws, PlanStage* child); - virtual ~ShardFilterStage(); - - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); - - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - - virtual std::vector<PlanStage*> getChildren() const; - - virtual StageType stageType() const { return STAGE_SHARDING_FILTER; } - - virtual PlanStageStats* getStats(); - - virtual const CommonStats* getCommonStats() const; - - virtual const SpecificStats* getSpecificStats() const; - - static const char* kStageType; - - private: - WorkingSet* _ws; - std::unique_ptr<PlanStage> _child; - - // Stats - CommonStats _commonStats; - ShardingFilterStats _specificStats; - - // Note: it is important that this is the metadata from the time this stage is constructed. - // See class comment for details. - const CollectionMetadataPtr _metadata; - }; +/** + * This stage drops documents that didn't belong to the shard we're executing on at the time of + * construction. This matches the contract for sharded cursorids which guarantees that a + * StaleConfigException will be thrown early or the cursorid for its entire lifetime will return + * documents matching the shard version set on the connection at the time of cursorid creation. + * + * A related system will ensure that the data migrated away from a shard will not be deleted as + * long as there are active queries from before the migration. Currently, "active queries" is + * defined by cursorids so it is important that the metadata used in this stage uses the same + * version as the cursorid. Therefore, you must wrap any Runner using this Stage in a + * ClientCursor during the same lock grab as constructing the Runner. 
+ * + * BEGIN NOTE FROM GREG + * + * There are three sharded query contracts: + * + * 0) Migration commit takes the db lock - i.e. is serialized with writes and reads. + * 1) No data should be returned from a query in ranges of migrations that committed after the + * query started, or from ranges not owned when the query began. + * 2) No migrated data should be removed from a shard while there are queries that were active + * before the migration. + * + * As implementation details, collection metadata is used to determine the ranges of all data + * not actively migrated (or orphaned). CursorIds are currently used to establish "active" + * queries before migration commit. + * + * Combining all this: if a query is started in a db lock and acquires in that (same) lock the + * collection metadata and a cursorId, the query will return results for exactly the ranges in + * the metadata (though of arbitrary staleness). This is the sharded collection query contract. + * + * END NOTE FROM GREG + * + * Preconditions: Child must be fetched. TODO: when covering analysis is in just build doc + * and check that against shard key. See SERVER-5022. 
+ */ +class ShardFilterStage : public PlanStage { +public: + ShardFilterStage(const CollectionMetadataPtr& metadata, WorkingSet* ws, PlanStage* child); + virtual ~ShardFilterStage(); + + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); + + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + + virtual std::vector<PlanStage*> getChildren() const; + + virtual StageType stageType() const { + return STAGE_SHARDING_FILTER; + } + + virtual PlanStageStats* getStats(); + + virtual const CommonStats* getCommonStats() const; + + virtual const SpecificStats* getSpecificStats() const; + + static const char* kStageType; + +private: + WorkingSet* _ws; + std::unique_ptr<PlanStage> _child; + + // Stats + CommonStats _commonStats; + ShardingFilterStats _specificStats; + + // Note: it is important that this is the metadata from the time this stage is constructed. + // See class comment for details. 
+ const CollectionMetadataPtr _metadata; +}; } // namespace mongo diff --git a/src/mongo/db/exec/skip.cpp b/src/mongo/db/exec/skip.cpp index 979e952b1d7..33e66178af0 100644 --- a/src/mongo/db/exec/skip.cpp +++ b/src/mongo/db/exec/skip.cpp @@ -33,103 +33,102 @@ namespace mongo { - using std::unique_ptr; - using std::vector; +using std::unique_ptr; +using std::vector; - // static - const char* SkipStage::kStageType = "SKIP"; +// static +const char* SkipStage::kStageType = "SKIP"; - SkipStage::SkipStage(int toSkip, WorkingSet* ws, PlanStage* child) - : _ws(ws), _child(child), _toSkip(toSkip), _commonStats(kStageType) { } +SkipStage::SkipStage(int toSkip, WorkingSet* ws, PlanStage* child) + : _ws(ws), _child(child), _toSkip(toSkip), _commonStats(kStageType) {} - SkipStage::~SkipStage() { } +SkipStage::~SkipStage() {} - bool SkipStage::isEOF() { return _child->isEOF(); } +bool SkipStage::isEOF() { + return _child->isEOF(); +} - PlanStage::StageState SkipStage::work(WorkingSetID* out) { - ++_commonStats.works; +PlanStage::StageState SkipStage::work(WorkingSetID* out) { + ++_commonStats.works; - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - WorkingSetID id = WorkingSet::INVALID_ID; - StageState status = _child->work(&id); + WorkingSetID id = WorkingSet::INVALID_ID; + StageState status = _child->work(&id); - if (PlanStage::ADVANCED == status) { - // If we're still skipping results... - if (_toSkip > 0) { - // ...drop the result. 
- --_toSkip; - _ws->free(id); - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - - *out = id; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { - *out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. - if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "skip stage failed to read in results from child"; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } - return status; - } - else if (PlanStage::NEED_TIME == status) { + if (PlanStage::ADVANCED == status) { + // If we're still skipping results... + if (_toSkip > 0) { + // ...drop the result. + --_toSkip; + _ws->free(id); ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == status) { - ++_commonStats.needYield; - *out = id; + return PlanStage::NEED_TIME; } - // NEED_TIME, NEED_YIELD, ERROR, IS_EOF + *out = id; + ++_commonStats.advanced; + return PlanStage::ADVANCED; + } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. 
+ if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "skip stage failed to read in results from child"; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + } return status; + } else if (PlanStage::NEED_TIME == status) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == status) { + ++_commonStats.needYield; + *out = id; } - void SkipStage::saveState() { - ++_commonStats.yields; - _child->saveState(); - } - - void SkipStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; - _child->restoreState(opCtx); - } - - void SkipStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - _child->invalidate(txn, dl, type); - } - - vector<PlanStage*> SkipStage::getChildren() const { - vector<PlanStage*> children; - children.push_back(_child.get()); - return children; - } - - PlanStageStats* SkipStage::getStats() { - _commonStats.isEOF = isEOF(); - _specificStats.skip = _toSkip; - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_SKIP)); - ret->specific.reset(new SkipStats(_specificStats)); - ret->children.push_back(_child->getStats()); - return ret.release(); - } - - const CommonStats* SkipStage::getCommonStats() const { - return &_commonStats; - } - - const SpecificStats* SkipStage::getSpecificStats() const { - return &_specificStats; - } + // NEED_TIME, NEED_YIELD, ERROR, IS_EOF + return status; +} + +void SkipStage::saveState() { + ++_commonStats.yields; + _child->saveState(); +} + +void SkipStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; + _child->restoreState(opCtx); +} + +void SkipStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + _child->invalidate(txn, dl, type); +} + +vector<PlanStage*> SkipStage::getChildren() const { + vector<PlanStage*> children; + children.push_back(_child.get()); + 
return children; +} + +PlanStageStats* SkipStage::getStats() { + _commonStats.isEOF = isEOF(); + _specificStats.skip = _toSkip; + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_SKIP)); + ret->specific.reset(new SkipStats(_specificStats)); + ret->children.push_back(_child->getStats()); + return ret.release(); +} + +const CommonStats* SkipStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* SkipStage::getSpecificStats() const { + return &_specificStats; +} } // namespace mongo diff --git a/src/mongo/db/exec/skip.h b/src/mongo/db/exec/skip.h index 77bdd5786a5..f03d0135186 100644 --- a/src/mongo/db/exec/skip.h +++ b/src/mongo/db/exec/skip.h @@ -35,46 +35,48 @@ namespace mongo { - /** - * This stage implements skip functionality. It drops the first 'toSkip' results from its child - * then returns the rest verbatim. - * - * Preconditions: None. - */ - class SkipStage : public PlanStage { - public: - SkipStage(int toSkip, WorkingSet* ws, PlanStage* child); - virtual ~SkipStage(); +/** + * This stage implements skip functionality. It drops the first 'toSkip' results from its child + * then returns the rest verbatim. + * + * Preconditions: None. 
+ */ +class SkipStage : public PlanStage { +public: + SkipStage(int toSkip, WorkingSet* ws, PlanStage* child); + virtual ~SkipStage(); - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_SKIP; } + virtual StageType stageType() const { + return STAGE_SKIP; + } - virtual PlanStageStats* getStats(); + virtual PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static const char* kStageType; + static const char* kStageType; - private: - WorkingSet* _ws; - std::unique_ptr<PlanStage> _child; +private: + WorkingSet* _ws; + std::unique_ptr<PlanStage> _child; - // We drop the first _toSkip results that we would have returned. - int _toSkip; + // We drop the first _toSkip results that we would have returned. 
+ int _toSkip; - // Stats - CommonStats _commonStats; - SkipStats _specificStats; - }; + // Stats + CommonStats _commonStats; + SkipStats _specificStats; +}; } // namespace mongo diff --git a/src/mongo/db/exec/sort.cpp b/src/mongo/db/exec/sort.cpp index 61dd09e1dbc..7a98b5113c2 100644 --- a/src/mongo/db/exec/sort.cpp +++ b/src/mongo/db/exec/sort.cpp @@ -45,552 +45,534 @@ namespace mongo { - using std::unique_ptr; - using std::endl; - using std::vector; - - // static - const char* SortStage::kStageType = "SORT"; - - SortStageKeyGenerator::SortStageKeyGenerator(const Collection* collection, - const BSONObj& sortSpec, - const BSONObj& queryObj) { - _collection = collection; - _hasBounds = false; - _sortHasMeta = false; - _rawSortSpec = sortSpec; - - // 'sortSpec' can be a mix of $meta and index key expressions. We pick it apart so that - // we only generate Btree keys for the index key expressions. - - // The Btree key fields go in here. We pass this fake index key pattern to the Btree - // key generator below as part of generating sort keys for the docs. - BSONObjBuilder btreeBob; - - // The pattern we use to woCompare keys. Each field in 'sortSpec' will go in here with - // a value of 1 or -1. The Btree key fields are verbatim, meta fields have a default. - BSONObjBuilder comparatorBob; - - BSONObjIterator it(sortSpec); - while (it.more()) { - BSONElement elt = it.next(); - if (elt.isNumber()) { - // Btree key. elt (should be) foo: 1 or foo: -1. - comparatorBob.append(elt); - btreeBob.append(elt); - } - else if (LiteParsedQuery::isTextScoreMeta(elt)) { - // Sort text score decreasing by default. Field name doesn't matter but we choose - // something that a user shouldn't ever have. - comparatorBob.append("$metaTextScore", -1); - _sortHasMeta = true; - } - else { - // Sort spec. should have been validated before here. - verify(false); - } - } - - // Our pattern for woComparing keys. 
- _comparatorObj = comparatorBob.obj(); - - // The fake index key pattern used to generate Btree keys. - _btreeObj = btreeBob.obj(); - - // If we're just sorting by meta, don't bother with all the key stuff. - if (_btreeObj.isEmpty()) { - return; +using std::unique_ptr; +using std::endl; +using std::vector; + +// static +const char* SortStage::kStageType = "SORT"; + +SortStageKeyGenerator::SortStageKeyGenerator(const Collection* collection, + const BSONObj& sortSpec, + const BSONObj& queryObj) { + _collection = collection; + _hasBounds = false; + _sortHasMeta = false; + _rawSortSpec = sortSpec; + + // 'sortSpec' can be a mix of $meta and index key expressions. We pick it apart so that + // we only generate Btree keys for the index key expressions. + + // The Btree key fields go in here. We pass this fake index key pattern to the Btree + // key generator below as part of generating sort keys for the docs. + BSONObjBuilder btreeBob; + + // The pattern we use to woCompare keys. Each field in 'sortSpec' will go in here with + // a value of 1 or -1. The Btree key fields are verbatim, meta fields have a default. + BSONObjBuilder comparatorBob; + + BSONObjIterator it(sortSpec); + while (it.more()) { + BSONElement elt = it.next(); + if (elt.isNumber()) { + // Btree key. elt (should be) foo: 1 or foo: -1. + comparatorBob.append(elt); + btreeBob.append(elt); + } else if (LiteParsedQuery::isTextScoreMeta(elt)) { + // Sort text score decreasing by default. Field name doesn't matter but we choose + // something that a user shouldn't ever have. + comparatorBob.append("$metaTextScore", -1); + _sortHasMeta = true; + } else { + // Sort spec. should have been validated before here. + verify(false); } + } - // We'll need to treat arrays as if we were to create an index over them. that is, - // we may need to unnest the first level and consider each array element to decide - // the sort order. 
- std::vector<const char *> fieldNames; - std::vector<BSONElement> fixed; - BSONObjIterator btreeIt(_btreeObj); - while (btreeIt.more()) { - BSONElement patternElt = btreeIt.next(); - fieldNames.push_back(patternElt.fieldName()); - fixed.push_back(BSONElement()); - } + // Our pattern for woComparing keys. + _comparatorObj = comparatorBob.obj(); - _keyGen.reset(new BtreeKeyGeneratorV1(fieldNames, fixed, false /* not sparse */)); + // The fake index key pattern used to generate Btree keys. + _btreeObj = btreeBob.obj(); - // The bounds checker only works on the Btree part of the sort key. - getBoundsForSort(queryObj, _btreeObj); + // If we're just sorting by meta, don't bother with all the key stuff. + if (_btreeObj.isEmpty()) { + return; + } - if (_hasBounds) { - _boundsChecker.reset(new IndexBoundsChecker(&_bounds, _btreeObj, 1 /* == order */)); - } + // We'll need to treat arrays as if we were to create an index over them. that is, + // we may need to unnest the first level and consider each array element to decide + // the sort order. + std::vector<const char*> fieldNames; + std::vector<BSONElement> fixed; + BSONObjIterator btreeIt(_btreeObj); + while (btreeIt.more()) { + BSONElement patternElt = btreeIt.next(); + fieldNames.push_back(patternElt.fieldName()); + fixed.push_back(BSONElement()); } - Status SortStageKeyGenerator::getSortKey(const WorkingSetMember& member, - BSONObj* objOut) const { - BSONObj btreeKeyToUse; + _keyGen.reset(new BtreeKeyGeneratorV1(fieldNames, fixed, false /* not sparse */)); - Status btreeStatus = getBtreeKey(member.obj.value(), &btreeKeyToUse); - if (!btreeStatus.isOK()) { - return btreeStatus; - } + // The bounds checker only works on the Btree part of the sort key. 
+ getBoundsForSort(queryObj, _btreeObj); - if (!_sortHasMeta) { - *objOut = btreeKeyToUse; - return Status::OK(); - } + if (_hasBounds) { + _boundsChecker.reset(new IndexBoundsChecker(&_bounds, _btreeObj, 1 /* == order */)); + } +} - BSONObjBuilder mergedKeyBob; +Status SortStageKeyGenerator::getSortKey(const WorkingSetMember& member, BSONObj* objOut) const { + BSONObj btreeKeyToUse; - // Merge metadata into the key. - BSONObjIterator it(_rawSortSpec); - BSONObjIterator btreeIt(btreeKeyToUse); - while (it.more()) { - BSONElement elt = it.next(); - if (elt.isNumber()) { - // Merge btree key elt. - mergedKeyBob.append(btreeIt.next()); - } - else if (LiteParsedQuery::isTextScoreMeta(elt)) { - // Add text score metadata - double score = 0.0; - if (member.hasComputed(WSM_COMPUTED_TEXT_SCORE)) { - const TextScoreComputedData* scoreData - = static_cast<const TextScoreComputedData*>( - member.getComputed(WSM_COMPUTED_TEXT_SCORE)); - score = scoreData->getScore(); - } - mergedKeyBob.append("$metaTextScore", score); - } - } + Status btreeStatus = getBtreeKey(member.obj.value(), &btreeKeyToUse); + if (!btreeStatus.isOK()) { + return btreeStatus; + } - *objOut = mergedKeyBob.obj(); + if (!_sortHasMeta) { + *objOut = btreeKeyToUse; return Status::OK(); } - Status SortStageKeyGenerator::getBtreeKey(const BSONObj& memberObj, BSONObj* objOut) const { - // Not sorting by anything in the key, just bail out early. - if (_btreeObj.isEmpty()) { - *objOut = BSONObj(); - return Status::OK(); - } - - // We will sort '_data' in the same order an index over '_pattern' would have. This is - // tricky. Consider the sort pattern {a:1} and the document {a:[1, 10]}. We have - // potentially two keys we could use to sort on. Here we extract these keys. - BSONObjCmp patternCmp(_btreeObj); - BSONObjSet keys(patternCmp); - - try { - _keyGen->getKeys(memberObj, &keys); - } - catch (const UserException& e) { - // Probably a parallel array. 
- if (BtreeKeyGenerator::ParallelArraysCode == e.getCode()) { - return Status(ErrorCodes::BadValue, - "cannot sort with keys that are parallel arrays"); + BSONObjBuilder mergedKeyBob; + + // Merge metadata into the key. + BSONObjIterator it(_rawSortSpec); + BSONObjIterator btreeIt(btreeKeyToUse); + while (it.more()) { + BSONElement elt = it.next(); + if (elt.isNumber()) { + // Merge btree key elt. + mergedKeyBob.append(btreeIt.next()); + } else if (LiteParsedQuery::isTextScoreMeta(elt)) { + // Add text score metadata + double score = 0.0; + if (member.hasComputed(WSM_COMPUTED_TEXT_SCORE)) { + const TextScoreComputedData* scoreData = static_cast<const TextScoreComputedData*>( + member.getComputed(WSM_COMPUTED_TEXT_SCORE)); + score = scoreData->getScore(); } - else { - return e.toStatus(); - } - } - catch (...) { - return Status(ErrorCodes::InternalError, "unknown error during sort key generation"); + mergedKeyBob.append("$metaTextScore", score); } + } - // Key generator isn't sparse so we should at least get an all-null key. - invariant(!keys.empty()); + *objOut = mergedKeyBob.obj(); + return Status::OK(); +} - // No bounds? No problem! Use the first key. - if (!_hasBounds) { - // Note that we sort 'keys' according to the pattern '_btreeObj'. - *objOut = *keys.begin(); - return Status::OK(); - } +Status SortStageKeyGenerator::getBtreeKey(const BSONObj& memberObj, BSONObj* objOut) const { + // Not sorting by anything in the key, just bail out early. + if (_btreeObj.isEmpty()) { + *objOut = BSONObj(); + return Status::OK(); + } - // To decide which key to use in sorting, we must consider not only the sort pattern but - // the query. Assume we have the query {a: {$gte: 5}} and a document {a:1}. That - // document wouldn't match the query. As such, the key '1' in an array {a: [1, 10]} - // should not be considered as being part of the result set and thus that array cannot - // sort using the key '1'. To ensure that the keys we sort by are valid w.r.t. 
the - // query we use a bounds checker. - verify(NULL != _boundsChecker.get()); - for (BSONObjSet::const_iterator it = keys.begin(); it != keys.end(); ++it) { - if (_boundsChecker->isValidKey(*it)) { - *objOut = *it; - return Status::OK(); - } + // We will sort '_data' in the same order an index over '_pattern' would have. This is + // tricky. Consider the sort pattern {a:1} and the document {a:[1, 10]}. We have + // potentially two keys we could use to sort on. Here we extract these keys. + BSONObjCmp patternCmp(_btreeObj); + BSONObjSet keys(patternCmp); + + try { + _keyGen->getKeys(memberObj, &keys); + } catch (const UserException& e) { + // Probably a parallel array. + if (BtreeKeyGenerator::ParallelArraysCode == e.getCode()) { + return Status(ErrorCodes::BadValue, "cannot sort with keys that are parallel arrays"); + } else { + return e.toStatus(); } + } catch (...) { + return Status(ErrorCodes::InternalError, "unknown error during sort key generation"); + } + + // Key generator isn't sparse so we should at least get an all-null key. + invariant(!keys.empty()); - // No key is in our bounds. - // TODO: will this ever happen? don't think it should. + // No bounds? No problem! Use the first key. + if (!_hasBounds) { + // Note that we sort 'keys' according to the pattern '_btreeObj'. *objOut = *keys.begin(); return Status::OK(); } - void SortStageKeyGenerator::getBoundsForSort(const BSONObj& queryObj, const BSONObj& sortObj) { - QueryPlannerParams params; - params.options = QueryPlannerParams::NO_TABLE_SCAN; - - // We're creating a "virtual index" with key pattern equal to the sort order. 
- IndexEntry sortOrder(sortObj, IndexNames::BTREE, true, false, false, "doesnt_matter", NULL, - BSONObj()); - params.indices.push_back(sortOrder); - - CanonicalQuery* rawQueryForSort; - verify(CanonicalQuery::canonicalize( - "fake_ns", queryObj, &rawQueryForSort, WhereCallbackNoop()).isOK()); - unique_ptr<CanonicalQuery> queryForSort(rawQueryForSort); - - vector<QuerySolution*> solns; - LOG(5) << "Sort stage: Planning to obtain bounds for sort." << endl; - QueryPlanner::plan(*queryForSort, params, &solns); - - // TODO: are there ever > 1 solns? If so, do we look for a specific soln? - if (1 == solns.size()) { - IndexScanNode* ixScan = NULL; - QuerySolutionNode* rootNode = solns[0]->root.get(); - - if (rootNode->getType() == STAGE_FETCH) { - FetchNode* fetchNode = static_cast<FetchNode*>(rootNode); - if (fetchNode->children[0]->getType() != STAGE_IXSCAN) { - delete solns[0]; - // No bounds. - return; - } - ixScan = static_cast<IndexScanNode*>(fetchNode->children[0]); - } - else if (rootNode->getType() == STAGE_IXSCAN) { - ixScan = static_cast<IndexScanNode*>(rootNode); - } + // To decide which key to use in sorting, we must consider not only the sort pattern but + // the query. Assume we have the query {a: {$gte: 5}} and a document {a:1}. That + // document wouldn't match the query. As such, the key '1' in an array {a: [1, 10]} + // should not be considered as being part of the result set and thus that array cannot + // sort using the key '1'. To ensure that the keys we sort by are valid w.r.t. the + // query we use a bounds checker. + verify(NULL != _boundsChecker.get()); + for (BSONObjSet::const_iterator it = keys.begin(); it != keys.end(); ++it) { + if (_boundsChecker->isValidKey(*it)) { + *objOut = *it; + return Status::OK(); + } + } - if (ixScan) { - _bounds.fields.swap(ixScan->bounds.fields); - _hasBounds = true; + // No key is in our bounds. + // TODO: will this ever happen? don't think it should. 
+ *objOut = *keys.begin(); + return Status::OK(); +} + +void SortStageKeyGenerator::getBoundsForSort(const BSONObj& queryObj, const BSONObj& sortObj) { + QueryPlannerParams params; + params.options = QueryPlannerParams::NO_TABLE_SCAN; + + // We're creating a "virtual index" with key pattern equal to the sort order. + IndexEntry sortOrder( + sortObj, IndexNames::BTREE, true, false, false, "doesnt_matter", NULL, BSONObj()); + params.indices.push_back(sortOrder); + + CanonicalQuery* rawQueryForSort; + verify(CanonicalQuery::canonicalize("fake_ns", queryObj, &rawQueryForSort, WhereCallbackNoop()) + .isOK()); + unique_ptr<CanonicalQuery> queryForSort(rawQueryForSort); + + vector<QuerySolution*> solns; + LOG(5) << "Sort stage: Planning to obtain bounds for sort." << endl; + QueryPlanner::plan(*queryForSort, params, &solns); + + // TODO: are there ever > 1 solns? If so, do we look for a specific soln? + if (1 == solns.size()) { + IndexScanNode* ixScan = NULL; + QuerySolutionNode* rootNode = solns[0]->root.get(); + + if (rootNode->getType() == STAGE_FETCH) { + FetchNode* fetchNode = static_cast<FetchNode*>(rootNode); + if (fetchNode->children[0]->getType() != STAGE_IXSCAN) { + delete solns[0]; + // No bounds. + return; } + ixScan = static_cast<IndexScanNode*>(fetchNode->children[0]); + } else if (rootNode->getType() == STAGE_IXSCAN) { + ixScan = static_cast<IndexScanNode*>(rootNode); } - for (size_t i = 0; i < solns.size(); ++i) { - delete solns[i]; + if (ixScan) { + _bounds.fields.swap(ixScan->bounds.fields); + _hasBounds = true; } } - SortStage::WorkingSetComparator::WorkingSetComparator(BSONObj p) : pattern(p) { } + for (size_t i = 0; i < solns.size(); ++i) { + delete solns[i]; + } +} + +SortStage::WorkingSetComparator::WorkingSetComparator(BSONObj p) : pattern(p) {} - bool SortStage::WorkingSetComparator::operator()(const SortableDataItem& lhs, const SortableDataItem& rhs) const { - // False means ignore field names. 
- int result = lhs.sortKey.woCompare(rhs.sortKey, pattern, false); - if (0 != result) { - return result < 0; +bool SortStage::WorkingSetComparator::operator()(const SortableDataItem& lhs, + const SortableDataItem& rhs) const { + // False means ignore field names. + int result = lhs.sortKey.woCompare(rhs.sortKey, pattern, false); + if (0 != result) { + return result < 0; + } + // Indices use RecordId as an additional sort key so we must as well. + return lhs.loc < rhs.loc; +} + +SortStage::SortStage(const SortStageParams& params, WorkingSet* ws, PlanStage* child) + : _collection(params.collection), + _ws(ws), + _child(child), + _pattern(params.pattern), + _query(params.query), + _limit(params.limit), + _sorted(false), + _resultIterator(_data.end()), + _commonStats(kStageType), + _memUsage(0) {} + +SortStage::~SortStage() {} + +bool SortStage::isEOF() { + // We're done when our child has no more results, we've sorted the child's results, and + // we've returned all sorted results. + return _child->isEOF() && _sorted && (_data.end() == _resultIterator); +} + +PlanStage::StageState SortStage::work(WorkingSetID* out) { + ++_commonStats.works; + + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); + + if (NULL == _sortKeyGen) { + // This is heavy and should be done as part of work(). + _sortKeyGen.reset(new SortStageKeyGenerator(_collection, _pattern, _query)); + _sortKeyComparator.reset(new WorkingSetComparator(_sortKeyGen->getSortComparator())); + // If limit > 1, we need to initialize _dataSet here to maintain ordered + // set of data items while fetching from the child stage. + if (_limit > 1) { + const WorkingSetComparator& cmp = *_sortKeyComparator; + _dataSet.reset(new SortableDataItemSet(cmp)); } - // Indices use RecordId as an additional sort key so we must as well. 
- return lhs.loc < rhs.loc; + return PlanStage::NEED_TIME; } - SortStage::SortStage(const SortStageParams& params, - WorkingSet* ws, - PlanStage* child) - : _collection(params.collection), - _ws(ws), - _child(child), - _pattern(params.pattern), - _query(params.query), - _limit(params.limit), - _sorted(false), - _resultIterator(_data.end()), - _commonStats(kStageType), - _memUsage(0) { + const size_t maxBytes = static_cast<size_t>(internalQueryExecMaxBlockingSortBytes); + if (_memUsage > maxBytes) { + mongoutils::str::stream ss; + ss << "Sort operation used more than the maximum " << maxBytes + << " bytes of RAM. Add an index, or specify a smaller limit."; + Status status(ErrorCodes::OperationFailed, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + return PlanStage::FAILURE; } - SortStage::~SortStage() { } - - bool SortStage::isEOF() { - // We're done when our child has no more results, we've sorted the child's results, and - // we've returned all sorted results. - return _child->isEOF() && _sorted && (_data.end() == _resultIterator); + if (isEOF()) { + return PlanStage::IS_EOF; } - PlanStage::StageState SortStage::work(WorkingSetID* out) { - ++_commonStats.works; - - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); - - if (NULL == _sortKeyGen) { - // This is heavy and should be done as part of work(). - _sortKeyGen.reset(new SortStageKeyGenerator(_collection, _pattern, _query)); - _sortKeyComparator.reset(new WorkingSetComparator(_sortKeyGen->getSortComparator())); - // If limit > 1, we need to initialize _dataSet here to maintain ordered - // set of data items while fetching from the child stage. - if (_limit > 1) { - const WorkingSetComparator& cmp = *_sortKeyComparator; - _dataSet.reset(new SortableDataItemSet(cmp)); - } - return PlanStage::NEED_TIME; - } + // Still reading in results to sort. 
+ if (!_sorted) { + WorkingSetID id = WorkingSet::INVALID_ID; + StageState code = _child->work(&id); - const size_t maxBytes = static_cast<size_t>(internalQueryExecMaxBlockingSortBytes); - if (_memUsage > maxBytes) { - mongoutils::str::stream ss; - ss << "Sort operation used more than the maximum " << maxBytes - << " bytes of RAM. Add an index, or specify a smaller limit."; - Status status(ErrorCodes::OperationFailed, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - return PlanStage::FAILURE; - } + if (PlanStage::ADVANCED == code) { + // Add it into the map for quick invalidation if it has a valid RecordId. + // A RecordId may be invalidated at any time (during a yield). We need to get into + // the WorkingSet as quickly as possible to handle it. + WorkingSetMember* member = _ws->get(id); - if (isEOF()) { return PlanStage::IS_EOF; } - - // Still reading in results to sort. - if (!_sorted) { - WorkingSetID id = WorkingSet::INVALID_ID; - StageState code = _child->work(&id); - - if (PlanStage::ADVANCED == code) { - // Add it into the map for quick invalidation if it has a valid RecordId. - // A RecordId may be invalidated at any time (during a yield). We need to get into - // the WorkingSet as quickly as possible to handle it. - WorkingSetMember* member = _ws->get(id); - - // Planner must put a fetch before we get here. - verify(member->hasObj()); - - // We might be sorting something that was invalidated at some point. - if (member->hasLoc()) { - _wsidByDiskLoc[member->loc] = id; - } - - // The data remains in the WorkingSet and we wrap the WSID with the sort key. - SortableDataItem item; - Status sortKeyStatus = _sortKeyGen->getSortKey(*member, &item.sortKey); - if (!_sortKeyGen->getSortKey(*member, &item.sortKey).isOK()) { - *out = WorkingSetCommon::allocateStatusMember(_ws, sortKeyStatus); - return PlanStage::FAILURE; - } - item.wsid = id; - if (member->hasLoc()) { - // The RecordId breaks ties when sorting two WSMs with the same sort key. 
- item.loc = member->loc; - } - - addToBuffer(item); - - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - else if (PlanStage::IS_EOF == code) { - // TODO: We don't need the lock for this. We could ask for a yield and do this work - // unlocked. Also, this is performing a lot of work for one call to work(...) - sortBuffer(); - _resultIterator = _data.begin(); - _sorted = true; - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - else if (PlanStage::FAILURE == code || PlanStage::DEAD == code) { - *out = id; - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. If ID is invalid, we - // create our own error message. - if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "sort stage failed to read in results to sort from child"; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } - return code; + // Planner must put a fetch before we get here. + verify(member->hasObj()); + + // We might be sorting something that was invalidated at some point. + if (member->hasLoc()) { + _wsidByDiskLoc[member->loc] = id; } - else if (PlanStage::NEED_TIME == code) { - ++_commonStats.needTime; + + // The data remains in the WorkingSet and we wrap the WSID with the sort key. + SortableDataItem item; + Status sortKeyStatus = _sortKeyGen->getSortKey(*member, &item.sortKey); + if (!_sortKeyGen->getSortKey(*member, &item.sortKey).isOK()) { + *out = WorkingSetCommon::allocateStatusMember(_ws, sortKeyStatus); + return PlanStage::FAILURE; } - else if (PlanStage::NEED_YIELD == code) { - ++_commonStats.needYield; - *out = id; + item.wsid = id; + if (member->hasLoc()) { + // The RecordId breaks ties when sorting two WSMs with the same sort key. + item.loc = member->loc; } - return code; - } - - // Returning results. 
- verify(_resultIterator != _data.end()); - verify(_sorted); - *out = _resultIterator->wsid; - _resultIterator++; + addToBuffer(item); - // If we're returning something, take it out of our DL -> WSID map so that future - // calls to invalidate don't cause us to take action for a DL we're done with. - WorkingSetMember* member = _ws->get(*out); - if (member->hasLoc()) { - _wsidByDiskLoc.erase(member->loc); + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::IS_EOF == code) { + // TODO: We don't need the lock for this. We could ask for a yield and do this work + // unlocked. Also, this is performing a lot of work for one call to work(...) + sortBuffer(); + _resultIterator = _data.begin(); + _sorted = true; + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::FAILURE == code || PlanStage::DEAD == code) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. + if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "sort stage failed to read in results to sort from child"; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); + } + return code; + } else if (PlanStage::NEED_TIME == code) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == code) { + ++_commonStats.needYield; + *out = id; } - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - - void SortStage::saveState() { - ++_commonStats.yields; - _child->saveState(); + return code; } - void SortStage::restoreState(OperationContext* opCtx) { - ++_commonStats.unyields; - _child->restoreState(opCtx); + // Returning results. 
+ verify(_resultIterator != _data.end()); + verify(_sorted); + *out = _resultIterator->wsid; + _resultIterator++; + + // If we're returning something, take it out of our DL -> WSID map so that future + // calls to invalidate don't cause us to take action for a DL we're done with. + WorkingSetMember* member = _ws->get(*out); + if (member->hasLoc()) { + _wsidByDiskLoc.erase(member->loc); } - void SortStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - _child->invalidate(txn, dl, type); - - // If we have a deletion, we can fetch and carry on. - // If we have a mutation, it's easier to fetch and use the previous document. - // So, no matter what, fetch and keep the doc in play. + ++_commonStats.advanced; + return PlanStage::ADVANCED; +} - // _data contains indices into the WorkingSet, not actual data. If a WorkingSetMember in - // the WorkingSet needs to change state as a result of a RecordId invalidation, it will still - // be at the same spot in the WorkingSet. As such, we don't need to modify _data. - DataMap::iterator it = _wsidByDiskLoc.find(dl); +void SortStage::saveState() { + ++_commonStats.yields; + _child->saveState(); +} - // If we're holding on to data that's got the RecordId we're invalidating... - if (_wsidByDiskLoc.end() != it) { - // Grab the WSM that we're nuking. - WorkingSetMember* member = _ws->get(it->second); - verify(member->loc == dl); +void SortStage::restoreState(OperationContext* opCtx) { + ++_commonStats.unyields; + _child->restoreState(opCtx); +} - WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); +void SortStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + _child->invalidate(txn, dl, type); - // Remove the RecordId from our set of active DLs. - _wsidByDiskLoc.erase(it); - ++_specificStats.forcedFetches; - } - } + // If we have a deletion, we can fetch and carry on. 
+ // If we have a mutation, it's easier to fetch and use the previous document. + // So, no matter what, fetch and keep the doc in play. - vector<PlanStage*> SortStage::getChildren() const { - vector<PlanStage*> children; - children.push_back(_child.get()); - return children; - } + // _data contains indices into the WorkingSet, not actual data. If a WorkingSetMember in + // the WorkingSet needs to change state as a result of a RecordId invalidation, it will still + // be at the same spot in the WorkingSet. As such, we don't need to modify _data. + DataMap::iterator it = _wsidByDiskLoc.find(dl); - PlanStageStats* SortStage::getStats() { - _commonStats.isEOF = isEOF(); - const size_t maxBytes = static_cast<size_t>(internalQueryExecMaxBlockingSortBytes); - _specificStats.memLimit = maxBytes; - _specificStats.memUsage = _memUsage; - _specificStats.limit = _limit; - _specificStats.sortPattern = _pattern.getOwned(); - - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_SORT)); - ret->specific.reset(new SortStats(_specificStats)); - ret->children.push_back(_child->getStats()); - return ret.release(); - } + // If we're holding on to data that's got the RecordId we're invalidating... + if (_wsidByDiskLoc.end() != it) { + // Grab the WSM that we're nuking. + WorkingSetMember* member = _ws->get(it->second); + verify(member->loc == dl); - const CommonStats* SortStage::getCommonStats() const { - return &_commonStats; - } + WorkingSetCommon::fetchAndInvalidateLoc(txn, member, _collection); - const SpecificStats* SortStage::getSpecificStats() const { - return &_specificStats; + // Remove the RecordId from our set of active DLs. 
+ _wsidByDiskLoc.erase(it); + ++_specificStats.forcedFetches; } +} + +vector<PlanStage*> SortStage::getChildren() const { + vector<PlanStage*> children; + children.push_back(_child.get()); + return children; +} + +PlanStageStats* SortStage::getStats() { + _commonStats.isEOF = isEOF(); + const size_t maxBytes = static_cast<size_t>(internalQueryExecMaxBlockingSortBytes); + _specificStats.memLimit = maxBytes; + _specificStats.memUsage = _memUsage; + _specificStats.limit = _limit; + _specificStats.sortPattern = _pattern.getOwned(); + + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_SORT)); + ret->specific.reset(new SortStats(_specificStats)); + ret->children.push_back(_child->getStats()); + return ret.release(); +} + +const CommonStats* SortStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* SortStage::getSpecificStats() const { + return &_specificStats; +} - /** - * addToBuffer() and sortBuffer() work differently based on the - * configured limit. addToBuffer() is also responsible for - * performing some accounting on the overall memory usage to - * make sure we're not using too much memory. - * - * limit == 0: - * addToBuffer() - Adds item to vector. - * sortBuffer() - Sorts vector. - * limit == 1: - * addToBuffer() - Replaces first item in vector with max of - * current and new item. - * Updates memory usage if item was replaced. - * sortBuffer() - Does nothing. - * limit > 1: - * addToBuffer() - Does not update vector. Adds item to set. - * If size of set exceeds limit, remove item from set - * with lowest key. Updates memory usage accordingly. - * sortBuffer() - Copies items from set to vectors. - */ - void SortStage::addToBuffer(const SortableDataItem& item) { - // Holds ID of working set member to be freed at end of this function. - WorkingSetID wsidToFree = WorkingSet::INVALID_ID; - - if (_limit == 0) { +/** + * addToBuffer() and sortBuffer() work differently based on the + * configured limit. 
addToBuffer() is also responsible for + * performing some accounting on the overall memory usage to + * make sure we're not using too much memory. + * + * limit == 0: + * addToBuffer() - Adds item to vector. + * sortBuffer() - Sorts vector. + * limit == 1: + * addToBuffer() - Replaces first item in vector with max of + * current and new item. + * Updates memory usage if item was replaced. + * sortBuffer() - Does nothing. + * limit > 1: + * addToBuffer() - Does not update vector. Adds item to set. + * If size of set exceeds limit, remove item from set + * with lowest key. Updates memory usage accordingly. + * sortBuffer() - Copies items from set to vectors. + */ +void SortStage::addToBuffer(const SortableDataItem& item) { + // Holds ID of working set member to be freed at end of this function. + WorkingSetID wsidToFree = WorkingSet::INVALID_ID; + + if (_limit == 0) { + _data.push_back(item); + _memUsage += _ws->get(item.wsid)->getMemUsage(); + } else if (_limit == 1) { + if (_data.empty()) { _data.push_back(item); - _memUsage += _ws->get(item.wsid)->getMemUsage(); + _memUsage = _ws->get(item.wsid)->getMemUsage(); + return; } - else if (_limit == 1) { - if (_data.empty()) { - _data.push_back(item); - _memUsage = _ws->get(item.wsid)->getMemUsage(); - return; - } - wsidToFree = item.wsid; - const WorkingSetComparator& cmp = *_sortKeyComparator; - // Compare new item with existing item in vector. - if (cmp(item, _data[0])) { - wsidToFree = _data[0].wsid; - _data[0] = item; - _memUsage = _ws->get(item.wsid)->getMemUsage(); - } + wsidToFree = item.wsid; + const WorkingSetComparator& cmp = *_sortKeyComparator; + // Compare new item with existing item in vector. 
+ if (cmp(item, _data[0])) { + wsidToFree = _data[0].wsid; + _data[0] = item; + _memUsage = _ws->get(item.wsid)->getMemUsage(); } - else { - // Update data item set instead of vector - // Limit not reached - insert and return - vector<SortableDataItem>::size_type limit(_limit); - if (_dataSet->size() < limit) { - _dataSet->insert(item); - _memUsage += _ws->get(item.wsid)->getMemUsage(); - return; - } - // Limit will be exceeded - compare with item with lowest key - // If new item does not have a lower key value than last item, - // do nothing. - wsidToFree = item.wsid; - SortableDataItemSet::const_iterator lastItemIt = --(_dataSet->end()); - const SortableDataItem& lastItem = *lastItemIt; - const WorkingSetComparator& cmp = *_sortKeyComparator; - if (cmp(item, lastItem)) { - _memUsage -= _ws->get(lastItem.wsid)->getMemUsage(); - _memUsage += _ws->get(item.wsid)->getMemUsage(); - wsidToFree = lastItem.wsid; - // According to std::set iterator validity rules, - // it does not matter which of erase()/insert() happens first. - // Here, we choose to erase first to release potential resources - // used by the last item and to keep the scope of the iterator to a minimum. - _dataSet->erase(lastItemIt); - _dataSet->insert(item); - } + } else { + // Update data item set instead of vector + // Limit not reached - insert and return + vector<SortableDataItem>::size_type limit(_limit); + if (_dataSet->size() < limit) { + _dataSet->insert(item); + _memUsage += _ws->get(item.wsid)->getMemUsage(); + return; } - - // If the working set ID is valid, remove from - // RecordId invalidation map and free from working set. - if (wsidToFree != WorkingSet::INVALID_ID) { - WorkingSetMember* member = _ws->get(wsidToFree); - if (member->hasLoc()) { - _wsidByDiskLoc.erase(member->loc); - } - _ws->free(wsidToFree); + // Limit will be exceeded - compare with item with lowest key + // If new item does not have a lower key value than last item, + // do nothing. 
+ wsidToFree = item.wsid; + SortableDataItemSet::const_iterator lastItemIt = --(_dataSet->end()); + const SortableDataItem& lastItem = *lastItemIt; + const WorkingSetComparator& cmp = *_sortKeyComparator; + if (cmp(item, lastItem)) { + _memUsage -= _ws->get(lastItem.wsid)->getMemUsage(); + _memUsage += _ws->get(item.wsid)->getMemUsage(); + wsidToFree = lastItem.wsid; + // According to std::set iterator validity rules, + // it does not matter which of erase()/insert() happens first. + // Here, we choose to erase first to release potential resources + // used by the last item and to keep the scope of the iterator to a minimum. + _dataSet->erase(lastItemIt); + _dataSet->insert(item); } } - void SortStage::sortBuffer() { - if (_limit == 0) { - const WorkingSetComparator& cmp = *_sortKeyComparator; - std::sort(_data.begin(), _data.end(), cmp); - } - else if (_limit == 1) { - // Buffer contains either 0 or 1 item so it is already in a sorted state. - return; - } - else { - // Set already contains items in sorted order, so we simply copy the items - // from the set to the vector. - // Release the memory for the set after the copy. - vector<SortableDataItem> newData(_dataSet->begin(), _dataSet->end()); - _data.swap(newData); - _dataSet.reset(); + // If the working set ID is valid, remove from + // RecordId invalidation map and free from working set. + if (wsidToFree != WorkingSet::INVALID_ID) { + WorkingSetMember* member = _ws->get(wsidToFree); + if (member->hasLoc()) { + _wsidByDiskLoc.erase(member->loc); } + _ws->free(wsidToFree); + } +} + +void SortStage::sortBuffer() { + if (_limit == 0) { + const WorkingSetComparator& cmp = *_sortKeyComparator; + std::sort(_data.begin(), _data.end(), cmp); + } else if (_limit == 1) { + // Buffer contains either 0 or 1 item so it is already in a sorted state. + return; + } else { + // Set already contains items in sorted order, so we simply copy the items + // from the set to the vector. 
+ // Release the memory for the set after the copy. + vector<SortableDataItem> newData(_dataSet->begin(), _dataSet->end()); + _data.swap(newData); + _dataSet.reset(); } +} } // namespace mongo diff --git a/src/mongo/db/exec/sort.h b/src/mongo/db/exec/sort.h index 692b4b3b4cb..b04a3d07e43 100644 --- a/src/mongo/db/exec/sort.h +++ b/src/mongo/db/exec/sort.h @@ -41,235 +41,234 @@ namespace mongo { - class BtreeKeyGenerator; +class BtreeKeyGenerator; - // Parameters that must be provided to a SortStage - class SortStageParams { - public: - SortStageParams() : collection(NULL), limit(0) { } +// Parameters that must be provided to a SortStage +class SortStageParams { +public: + SortStageParams() : collection(NULL), limit(0) {} - // Used for resolving RecordIds to BSON - const Collection* collection; + // Used for resolving RecordIds to BSON + const Collection* collection; - // How we're sorting. - BSONObj pattern; + // How we're sorting. + BSONObj pattern; - // The query. Used to create the IndexBounds for the sorting. - BSONObj query; + // The query. Used to create the IndexBounds for the sorting. + BSONObj query; - // Equal to 0 for no limit. - size_t limit; - }; + // Equal to 0 for no limit. + size_t limit; +}; +/** + * Maps a WSM value to a BSONObj key that can then be sorted via BSONObjCmp. + */ +class SortStageKeyGenerator { +public: /** - * Maps a WSM value to a BSONObj key that can then be sorted via BSONObjCmp. + * 'sortSpec' is the BSONObj in the .sort(...) clause. + * + * 'queryObj' is the BSONObj in the .find(...) clause. For multikey arrays we have to + * ensure that the value we select to sort by is within bounds generated by + * executing 'queryObj' using the virtual index with key pattern 'sortSpec'. */ - class SortStageKeyGenerator { - public: - /** - * 'sortSpec' is the BSONObj in the .sort(...) clause. - * - * 'queryObj' is the BSONObj in the .find(...) clause. 
For multikey arrays we have to - * ensure that the value we select to sort by is within bounds generated by - * executing 'queryObj' using the virtual index with key pattern 'sortSpec'. - */ - SortStageKeyGenerator(const Collection* collection, - const BSONObj& sortSpec, - const BSONObj& queryObj); - - /** - * Returns the key used to sort 'member'. - */ - Status getSortKey(const WorkingSetMember& member, - BSONObj* objOut) const; - - /** - * Passed to std::sort and used to sort the keys that are returned from getSortKey. - * - * Returned reference lives as long as 'this'. - */ - const BSONObj& getSortComparator() const { return _comparatorObj; } - - private: - Status getBtreeKey(const BSONObj& memberObj, BSONObj* objOut) const; - - /** - * In order to emulate the existing sort behavior we must make unindexed sort behavior as - * consistent as possible with indexed sort behavior. As such, we must only consider index - * keys that we would encounter if we were answering the query using the sort-providing - * index. - * - * Populates _hasBounds and _bounds. - */ - void getBoundsForSort(const BSONObj& queryObj, - const BSONObj& sortObj); - - // Not owned by us - const Collection* _collection; - - // The object that we use to call woCompare on our resulting key. Is equal to _rawSortSpec - // unless we have some $meta expressions. Each $meta expression has a default sort order. - BSONObj _comparatorObj; - - // The raw object in .sort() - BSONObj _rawSortSpec; - - // The sort pattern with any non-Btree sort pulled out. - BSONObj _btreeObj; - - // If we're not sorting with a $meta value we can short-cut some work. - bool _sortHasMeta; - - // True if the bounds are valid. - bool _hasBounds; - - // The bounds generated from the query we're sorting. - IndexBounds _bounds; - - // Helper to extract sorting keys from documents. - std::unique_ptr<BtreeKeyGenerator> _keyGen; - - // Helper to filter keys, ensuring keys generated with _keyGen are within _bounds. 
- std::unique_ptr<IndexBoundsChecker> _boundsChecker; - }; + SortStageKeyGenerator(const Collection* collection, + const BSONObj& sortSpec, + const BSONObj& queryObj); + + /** + * Returns the key used to sort 'member'. + */ + Status getSortKey(const WorkingSetMember& member, BSONObj* objOut) const; + + /** + * Passed to std::sort and used to sort the keys that are returned from getSortKey. + * + * Returned reference lives as long as 'this'. + */ + const BSONObj& getSortComparator() const { + return _comparatorObj; + } + +private: + Status getBtreeKey(const BSONObj& memberObj, BSONObj* objOut) const; /** - * Sorts the input received from the child according to the sort pattern provided. + * In order to emulate the existing sort behavior we must make unindexed sort behavior as + * consistent as possible with indexed sort behavior. As such, we must only consider index + * keys that we would encounter if we were answering the query using the sort-providing + * index. * - * Preconditions: For each field in 'pattern', all inputs in the child must handle a - * getFieldDotted for that field. + * Populates _hasBounds and _bounds. */ - class SortStage : public PlanStage { - public: - SortStage(const SortStageParams& params, - WorkingSet* ws, - PlanStage* child); + void getBoundsForSort(const BSONObj& queryObj, const BSONObj& sortObj); - virtual ~SortStage(); + // Not owned by us + const Collection* _collection; - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); + // The object that we use to call woCompare on our resulting key. Is equal to _rawSortSpec + // unless we have some $meta expressions. Each $meta expression has a default sort order. 
+ BSONObj _comparatorObj; - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + // The raw object in .sort() + BSONObj _rawSortSpec; - virtual std::vector<PlanStage*> getChildren() const; + // The sort pattern with any non-Btree sort pulled out. + BSONObj _btreeObj; - virtual StageType stageType() const { return STAGE_SORT; } + // If we're not sorting with a $meta value we can short-cut some work. + bool _sortHasMeta; - PlanStageStats* getStats(); + // True if the bounds are valid. + bool _hasBounds; - virtual const CommonStats* getCommonStats() const; + // The bounds generated from the query we're sorting. + IndexBounds _bounds; - virtual const SpecificStats* getSpecificStats() const; + // Helper to extract sorting keys from documents. + std::unique_ptr<BtreeKeyGenerator> _keyGen; - static const char* kStageType; + // Helper to filter keys, ensuring keys generated with _keyGen are within _bounds. + std::unique_ptr<IndexBoundsChecker> _boundsChecker; +}; - private: +/** + * Sorts the input received from the child according to the sort pattern provided. + * + * Preconditions: For each field in 'pattern', all inputs in the child must handle a + * getFieldDotted for that field. + */ +class SortStage : public PlanStage { +public: + SortStage(const SortStageParams& params, WorkingSet* ws, PlanStage* child); + + virtual ~SortStage(); - // - // Query Stage - // + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); - // Not owned by us. - const Collection* _collection; + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - // Not owned by us. - WorkingSet* _ws; + virtual std::vector<PlanStage*> getChildren() const; - // Where we're reading data to sort from. 
- std::unique_ptr<PlanStage> _child; + virtual StageType stageType() const { + return STAGE_SORT; + } - // The raw sort _pattern as expressed by the user - BSONObj _pattern; + PlanStageStats* getStats(); - // The raw query as expressed by the user - BSONObj _query; + virtual const CommonStats* getCommonStats() const; - // Equal to 0 for no limit. - size_t _limit; + virtual const SpecificStats* getSpecificStats() const; - // - // Sort key generation - // - std::unique_ptr<SortStageKeyGenerator> _sortKeyGen; + static const char* kStageType; - // - // Data storage - // +private: + // + // Query Stage + // - // Have we sorted our data? If so, we can access _resultIterator. If not, - // we're still populating _data. - bool _sorted; + // Not owned by us. + const Collection* _collection; - // Collection of working set members to sort with their respective sort key. - struct SortableDataItem { - WorkingSetID wsid; - BSONObj sortKey; - // Since we must replicate the behavior of a covered sort as much as possible we use the - // RecordId to break sortKey ties. - // See sorta.js. - RecordId loc; - }; + // Not owned by us. + WorkingSet* _ws; - // Comparison object for data buffers (vector and set). - // Items are compared on (sortKey, loc). This is also how the items are - // ordered in the indices. - // Keys are compared using BSONObj::woCompare() with RecordId as a tie-breaker. - struct WorkingSetComparator { - explicit WorkingSetComparator(BSONObj p); + // Where we're reading data to sort from. + std::unique_ptr<PlanStage> _child; - bool operator()(const SortableDataItem& lhs, const SortableDataItem& rhs) const; + // The raw sort _pattern as expressed by the user + BSONObj _pattern; - BSONObj pattern; - }; + // The raw query as expressed by the user + BSONObj _query; - /** - * Inserts one item into data buffer (vector or set). - * If limit is exceeded, remove item with lowest key. - */ - void addToBuffer(const SortableDataItem& item); + // Equal to 0 for no limit. 
+ size_t _limit; - /** - * Sorts data buffer. - * Assumes no more items will be added to buffer. - * If data is stored in set, copy set - * contents to vector and clear set. - */ - void sortBuffer(); + // + // Sort key generation + // + std::unique_ptr<SortStageKeyGenerator> _sortKeyGen; - // Comparator for data buffer - // Initialization follows sort key generator - std::unique_ptr<WorkingSetComparator> _sortKeyComparator; + // + // Data storage + // + + // Have we sorted our data? If so, we can access _resultIterator. If not, + // we're still populating _data. + bool _sorted; + + // Collection of working set members to sort with their respective sort key. + struct SortableDataItem { + WorkingSetID wsid; + BSONObj sortKey; + // Since we must replicate the behavior of a covered sort as much as possible we use the + // RecordId to break sortKey ties. + // See sorta.js. + RecordId loc; + }; - // The data we buffer and sort. - // _data will contain sorted data when all data is gathered - // and sorted. - // When _limit is greater than 1 and not all data has been gathered from child stage, - // _dataSet is used instead to maintain an ordered set of the incomplete data set. - // When the data set is complete, we copy the items from _dataSet to _data which will - // be used to provide the results of this stage through _resultIterator. - std::vector<SortableDataItem> _data; - typedef std::set<SortableDataItem, WorkingSetComparator> SortableDataItemSet; - std::unique_ptr<SortableDataItemSet> _dataSet; + // Comparison object for data buffers (vector and set). + // Items are compared on (sortKey, loc). This is also how the items are + // ordered in the indices. + // Keys are compared using BSONObj::woCompare() with RecordId as a tie-breaker. + struct WorkingSetComparator { + explicit WorkingSetComparator(BSONObj p); - // Iterates through _data post-sort returning it. 
- std::vector<SortableDataItem>::iterator _resultIterator; + bool operator()(const SortableDataItem& lhs, const SortableDataItem& rhs) const; - // We buffer a lot of data and we want to look it up by RecordId quickly upon invalidation. - typedef unordered_map<RecordId, WorkingSetID, RecordId::Hasher> DataMap; - DataMap _wsidByDiskLoc; - - // - // Stats - // - - CommonStats _commonStats; - SortStats _specificStats; - - // The usage in bytes of all buffered data that we're sorting. - size_t _memUsage; + BSONObj pattern; }; + /** + * Inserts one item into data buffer (vector or set). + * If limit is exceeded, remove item with lowest key. + */ + void addToBuffer(const SortableDataItem& item); + + /** + * Sorts data buffer. + * Assumes no more items will be added to buffer. + * If data is stored in set, copy set + * contents to vector and clear set. + */ + void sortBuffer(); + + // Comparator for data buffer + // Initialization follows sort key generator + std::unique_ptr<WorkingSetComparator> _sortKeyComparator; + + // The data we buffer and sort. + // _data will contain sorted data when all data is gathered + // and sorted. + // When _limit is greater than 1 and not all data has been gathered from child stage, + // _dataSet is used instead to maintain an ordered set of the incomplete data set. + // When the data set is complete, we copy the items from _dataSet to _data which will + // be used to provide the results of this stage through _resultIterator. + std::vector<SortableDataItem> _data; + typedef std::set<SortableDataItem, WorkingSetComparator> SortableDataItemSet; + std::unique_ptr<SortableDataItemSet> _dataSet; + + // Iterates through _data post-sort returning it. + std::vector<SortableDataItem>::iterator _resultIterator; + + // We buffer a lot of data and we want to look it up by RecordId quickly upon invalidation. 
+ typedef unordered_map<RecordId, WorkingSetID, RecordId::Hasher> DataMap; + DataMap _wsidByDiskLoc; + + // + // Stats + // + + CommonStats _commonStats; + SortStats _specificStats; + + // The usage in bytes of all buffered data that we're sorting. + size_t _memUsage; +}; + } // namespace mongo diff --git a/src/mongo/db/exec/sort_test.cpp b/src/mongo/db/exec/sort_test.cpp index 8f6bf64ca79..0d4cc891952 100644 --- a/src/mongo/db/exec/sort_test.cpp +++ b/src/mongo/db/exec/sort_test.cpp @@ -41,201 +41,205 @@ using namespace mongo; namespace { - TEST(SortStageTest, SortEmptyWorkingSet) { - WorkingSet ws; +TEST(SortStageTest, SortEmptyWorkingSet) { + WorkingSet ws; - // QueuedDataStage will be owned by SortStage. - QueuedDataStage* ms = new QueuedDataStage(&ws); - SortStageParams params; - SortStage sort(params, &ws, ms); + // QueuedDataStage will be owned by SortStage. + QueuedDataStage* ms = new QueuedDataStage(&ws); + SortStageParams params; + SortStage sort(params, &ws, ms); - // Check initial EOF state. - ASSERT_TRUE(ms->isEOF()); - ASSERT_FALSE(sort.isEOF()); + // Check initial EOF state. + ASSERT_TRUE(ms->isEOF()); + ASSERT_FALSE(sort.isEOF()); - // First call to work() initializes sort key generator. - WorkingSetID id = WorkingSet::INVALID_ID; - PlanStage::StageState state = sort.work(&id); - ASSERT_EQUALS(state, PlanStage::NEED_TIME); + // First call to work() initializes sort key generator. + WorkingSetID id = WorkingSet::INVALID_ID; + PlanStage::StageState state = sort.work(&id); + ASSERT_EQUALS(state, PlanStage::NEED_TIME); - // Second call to work() sorts data in vector. - state = sort.work(&id); - ASSERT_EQUALS(state, PlanStage::NEED_TIME); - - // Finally we hit EOF. - state = sort.work(&id); - ASSERT_EQUALS(state, PlanStage::IS_EOF); - - ASSERT_TRUE(sort.isEOF()); - } - - /** - * Test function to verify sort stage. - * SortStageParams will be initialized using patternStr, queryStr and limit. - * inputStr represents the input data set in a BSONObj. 
- * {input: [doc1, doc2, doc3, ...]} - * expectedStr represents the expected sorted data set. - * {output: [docA, docB, docC, ...]} - */ - void testWork(const char* patternStr, const char* queryStr, int limit, - const char* inputStr, const char* expectedStr) { - - // WorkingSet is not owned by stages - // so it's fine to declare - WorkingSet ws; - - // QueuedDataStage will be owned by SortStage. - QueuedDataStage* ms = new QueuedDataStage(&ws); - BSONObj inputObj = fromjson(inputStr); - BSONElement inputElt = inputObj.getField("input"); - ASSERT(inputElt.isABSONObj()); - BSONObjIterator inputIt(inputElt.embeddedObject()); - while (inputIt.more()) { - BSONElement elt = inputIt.next(); - ASSERT(elt.isABSONObj()); - BSONObj obj = elt.embeddedObject(); - - // Insert obj from input array into working set. - WorkingSetMember wsm; - wsm.state = WorkingSetMember::OWNED_OBJ; - wsm.obj = Snapshotted<BSONObj>(SnapshotId(), obj); - ms->pushBack(wsm); - } - - // Initialize SortStageParams - // Setting limit to 0 means no limit - SortStageParams params; - params.pattern = fromjson(patternStr); - params.query = fromjson(queryStr); - params.limit = limit; - - SortStage sort(params, &ws, ms); - - WorkingSetID id = WorkingSet::INVALID_ID; - PlanStage::StageState state = PlanStage::NEED_TIME; - - // Keep working sort stage until data is available. - while (state == PlanStage::NEED_TIME) { - state = sort.work(&id); - } - - // Child's state should be EOF when sort is ready to advance. - ASSERT_TRUE(ms->isEOF()); - - // While there's data to be retrieved, state should be equal to ADVANCED. 
- // Insert documents into BSON document in this format: - // {output: [docA, docB, docC, ...]} - BSONObjBuilder bob; - BSONArrayBuilder arr(bob.subarrayStart("output")); - while (state == PlanStage::ADVANCED) { - WorkingSetMember* member = ws.get(id); - const BSONObj& obj = member->obj.value(); - arr.append(obj); - state = sort.work(&id); - } - arr.doneFast(); - BSONObj outputObj = bob.obj(); - - // Sort stage should be EOF after data is retrieved. - ASSERT_EQUALS(state, PlanStage::IS_EOF); - ASSERT_TRUE(sort.isEOF()); - - // Finally, we get to compare the sorted results against what we expect. - BSONObj expectedObj = fromjson(expectedStr); - if (outputObj != expectedObj) { - mongoutils::str::stream ss; - // Even though we have the original string representation of the expected output, - // we invoke BSONObj::toString() to get a format consistent with outputObj. - ss << "Unexpected sort result with query=" << queryStr << "; pattern=" << patternStr - << "; limit=" << limit << ":\n" - << "Expected: " << expectedObj.toString() << "\n" - << "Actual: " << outputObj.toString() << "\n"; - FAIL(ss); - } - } - - // - // Limit values - // The server interprets limit values from the user as follows: - // 0: no limit on query results. This is passed along unchanged to the sort stage. - // >0: soft limit. Also unchanged in sort stage. - // <0: hard limit. Absolute value is stored in parsed query and passed to sort stage. - // The sort stage treats both soft and hard limits in the same manner - - // - // Sort without limit - // Implementation should keep all items fetched from child. - // - - TEST(SortStageTest, SortAscending) { - testWork("{a: 1}", "{}", 0, - "{input: [{a: 2}, {a: 1}, {a: 3}]}", - "{output: [{a: 1}, {a: 2}, {a: 3}]}"); - } - - TEST(SortStageTest, SortDescending) { - testWork("{a: -1}", "{}", 0, - "{input: [{a: 2}, {a: 1}, {a: 3}]}", - "{output: [{a: 3}, {a: 2}, {a: 1}]}"); - } + // Second call to work() sorts data in vector. 
+ state = sort.work(&id); + ASSERT_EQUALS(state, PlanStage::NEED_TIME); - TEST(SortStageTest, SortIrrelevantSortKey) { - testWork("{b: 1}", "{}", 0, - "{input: [{a: 2}, {a: 1}, {a: 3}]}", - "{output: [{a: 2}, {a: 1}, {a: 3}]}"); - } + // Finally we hit EOF. + state = sort.work(&id); + ASSERT_EQUALS(state, PlanStage::IS_EOF); - // - // Sorting with limit > 1 - // Implementation should retain top N items - // and discard the rest. - // + ASSERT_TRUE(sort.isEOF()); +} - TEST(SortStageTest, SortAscendingWithLimit) { - testWork("{a: 1}", "{}", 2, - "{input: [{a: 2}, {a: 1}, {a: 3}]}", - "{output: [{a: 1}, {a: 2}]}"); +/** + * Test function to verify sort stage. + * SortStageParams will be initialized using patternStr, queryStr and limit. + * inputStr represents the input data set in a BSONObj. + * {input: [doc1, doc2, doc3, ...]} + * expectedStr represents the expected sorted data set. + * {output: [docA, docB, docC, ...]} + */ +void testWork(const char* patternStr, + const char* queryStr, + int limit, + const char* inputStr, + const char* expectedStr) { + // WorkingSet is not owned by stages + // so it's fine to declare + WorkingSet ws; + + // QueuedDataStage will be owned by SortStage. + QueuedDataStage* ms = new QueuedDataStage(&ws); + BSONObj inputObj = fromjson(inputStr); + BSONElement inputElt = inputObj.getField("input"); + ASSERT(inputElt.isABSONObj()); + BSONObjIterator inputIt(inputElt.embeddedObject()); + while (inputIt.more()) { + BSONElement elt = inputIt.next(); + ASSERT(elt.isABSONObj()); + BSONObj obj = elt.embeddedObject(); + + // Insert obj from input array into working set. 
+ WorkingSetMember wsm; + wsm.state = WorkingSetMember::OWNED_OBJ; + wsm.obj = Snapshotted<BSONObj>(SnapshotId(), obj); + ms->pushBack(wsm); } - TEST(SortStageTest, SortDescendingWithLimit) { - testWork("{a: -1}", "{}", 2, - "{input: [{a: 2}, {a: 1}, {a: 3}]}", - "{output: [{a: 3}, {a: 2}]}"); - } + // Initialize SortStageParams + // Setting limit to 0 means no limit + SortStageParams params; + params.pattern = fromjson(patternStr); + params.query = fromjson(queryStr); + params.limit = limit; - // - // Sorting with limit > size of data set - // Implementation should retain top N items - // and discard the rest. - // + SortStage sort(params, &ws, ms); - TEST(SortStageTest, SortAscendingWithLimitGreaterThanInputSize) { - testWork("{a: 1}", "{}", 10, - "{input: [{a: 2}, {a: 1}, {a: 3}]}", - "{output: [{a: 1}, {a: 2}, {a: 3}]}"); - } + WorkingSetID id = WorkingSet::INVALID_ID; + PlanStage::StageState state = PlanStage::NEED_TIME; - TEST(SortStageTest, SortDescendingWithLimitGreaterThanInputSize) { - testWork("{a: -1}", "{}", 10, - "{input: [{a: 2}, {a: 1}, {a: 3}]}", - "{output: [{a: 3}, {a: 2}, {a: 1}]}"); + // Keep working sort stage until data is available. + while (state == PlanStage::NEED_TIME) { + state = sort.work(&id); } - // - // Sorting with limit 1 - // Implementation should optimize this into a running maximum. - // - - TEST(SortStageTest, SortAscendingWithLimitOfOne) { - testWork("{a: 1}", "{}", 1, - "{input: [{a: 2}, {a: 1}, {a: 3}]}", - "{output: [{a: 1}]}"); + // Child's state should be EOF when sort is ready to advance. + ASSERT_TRUE(ms->isEOF()); + + // While there's data to be retrieved, state should be equal to ADVANCED. 
+ // Insert documents into BSON document in this format: + // {output: [docA, docB, docC, ...]} + BSONObjBuilder bob; + BSONArrayBuilder arr(bob.subarrayStart("output")); + while (state == PlanStage::ADVANCED) { + WorkingSetMember* member = ws.get(id); + const BSONObj& obj = member->obj.value(); + arr.append(obj); + state = sort.work(&id); } - - TEST(SortStageTest, SortDescendingWithLimitOfOne) { - testWork("{a: -1}", "{}", 1, - "{input: [{a: 2}, {a: 1}, {a: 3}]}", - "{output: [{a: 3}]}"); + arr.doneFast(); + BSONObj outputObj = bob.obj(); + + // Sort stage should be EOF after data is retrieved. + ASSERT_EQUALS(state, PlanStage::IS_EOF); + ASSERT_TRUE(sort.isEOF()); + + // Finally, we get to compare the sorted results against what we expect. + BSONObj expectedObj = fromjson(expectedStr); + if (outputObj != expectedObj) { + mongoutils::str::stream ss; + // Even though we have the original string representation of the expected output, + // we invoke BSONObj::toString() to get a format consistent with outputObj. + ss << "Unexpected sort result with query=" << queryStr << "; pattern=" << patternStr + << "; limit=" << limit << ":\n" + << "Expected: " << expectedObj.toString() << "\n" + << "Actual: " << outputObj.toString() << "\n"; + FAIL(ss); } +} + +// +// Limit values +// The server interprets limit values from the user as follows: +// 0: no limit on query results. This is passed along unchanged to the sort stage. +// >0: soft limit. Also unchanged in sort stage. +// <0: hard limit. Absolute value is stored in parsed query and passed to sort stage. +// The sort stage treats both soft and hard limits in the same manner + +// +// Sort without limit +// Implementation should keep all items fetched from child. 
+// + +TEST(SortStageTest, SortAscending) { + testWork("{a: 1}", + "{}", + 0, + "{input: [{a: 2}, {a: 1}, {a: 3}]}", + "{output: [{a: 1}, {a: 2}, {a: 3}]}"); +} + +TEST(SortStageTest, SortDescending) { + testWork("{a: -1}", + "{}", + 0, + "{input: [{a: 2}, {a: 1}, {a: 3}]}", + "{output: [{a: 3}, {a: 2}, {a: 1}]}"); +} + +TEST(SortStageTest, SortIrrelevantSortKey) { + testWork("{b: 1}", + "{}", + 0, + "{input: [{a: 2}, {a: 1}, {a: 3}]}", + "{output: [{a: 2}, {a: 1}, {a: 3}]}"); +} + +// +// Sorting with limit > 1 +// Implementation should retain top N items +// and discard the rest. +// + +TEST(SortStageTest, SortAscendingWithLimit) { + testWork("{a: 1}", "{}", 2, "{input: [{a: 2}, {a: 1}, {a: 3}]}", "{output: [{a: 1}, {a: 2}]}"); +} + +TEST(SortStageTest, SortDescendingWithLimit) { + testWork("{a: -1}", "{}", 2, "{input: [{a: 2}, {a: 1}, {a: 3}]}", "{output: [{a: 3}, {a: 2}]}"); +} + +// +// Sorting with limit > size of data set +// Implementation should retain top N items +// and discard the rest. +// + +TEST(SortStageTest, SortAscendingWithLimitGreaterThanInputSize) { + testWork("{a: 1}", + "{}", + 10, + "{input: [{a: 2}, {a: 1}, {a: 3}]}", + "{output: [{a: 1}, {a: 2}, {a: 3}]}"); +} + +TEST(SortStageTest, SortDescendingWithLimitGreaterThanInputSize) { + testWork("{a: -1}", + "{}", + 10, + "{input: [{a: 2}, {a: 1}, {a: 3}]}", + "{output: [{a: 3}, {a: 2}, {a: 1}]}"); +} + +// +// Sorting with limit 1 +// Implementation should optimize this into a running maximum. 
+// + +TEST(SortStageTest, SortAscendingWithLimitOfOne) { + testWork("{a: 1}", "{}", 1, "{input: [{a: 2}, {a: 1}, {a: 3}]}", "{output: [{a: 1}]}"); +} + +TEST(SortStageTest, SortDescendingWithLimitOfOne) { + testWork("{a: -1}", "{}", 1, "{input: [{a: 2}, {a: 1}, {a: 3}]}", "{output: [{a: 3}]}"); +} } // namespace diff --git a/src/mongo/db/exec/stagedebug_cmd.cpp b/src/mongo/db/exec/stagedebug_cmd.cpp index e648a97518a..3dc0aa8549b 100644 --- a/src/mongo/db/exec/stagedebug_cmd.cpp +++ b/src/mongo/db/exec/stagedebug_cmd.cpp @@ -59,328 +59,318 @@ namespace mongo { - using std::unique_ptr; - using std::string; - using std::vector; - - /** - * A command for manually constructing a query tree and running it. - * - * db.runCommand({stageDebug: {collection: collname, plan: rootNode}}) - * - * The value of the filter field is a BSONObj that specifies values that fields must have. What - * you'd pass to a matcher. - * - * Leaf Nodes: - * - * node -> {ixscan: {filter: {FILTER}, - * args: {indexKeyPattern: kpObj, start: startObj, - * stop: stopObj, endInclusive: true/false, direction: -1/1, - * limit: int}}} - * node -> {cscan: {filter: {filter}, args: {direction: -1/1}}} - * TODO: language for text. 
- * node -> {text: {filter: {filter}, args: {search: "searchstr"}}} - * - * Internal Nodes: - * - * node -> {andHash: {args: { nodes: [node, node]}}} - * node -> {andSorted: {args: { nodes: [node, node]}}} - * node -> {or: {filter: {filter}, args: { dedup:bool, nodes:[node, node]}}} - * node -> {fetch: {filter: {filter}, args: {node: node}}} - * node -> {limit: {args: {node: node, num: posint}}} - * node -> {skip: {args: {node: node, num: posint}}} - * node -> {sort: {args: {node: node, pattern: objWithSortCriterion }}} - * node -> {mergeSort: {args: {nodes: [node, node], pattern: objWithSortCriterion}}} - * node -> {delete: {args: {node: node, isMulti: bool, shouldCallLogOp: bool}}} - * - * Forthcoming Nodes: - * - * node -> {dedup: {filter: {filter}, args: {node: node, field: field}}} - * node -> {unwind: {filter: filter}, args: {node: node, field: field}} - */ - class StageDebugCmd : public Command { - public: - StageDebugCmd() : Command("stageDebug") { } - - virtual bool isWriteCommandForConfigServer() const { return false; } - bool slaveOk() const { return false; } - bool slaveOverrideOk() const { return false; } - void help(std::stringstream& h) const { } - - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - // Command is testing-only, and can only be enabled at command line. Hence, no auth - // check needed. - } +using std::unique_ptr; +using std::string; +using std::vector; - bool run(OperationContext* txn, - const string& dbname, - BSONObj& cmdObj, - int, - string& errmsg, - BSONObjBuilder& result) { - BSONElement argElt = cmdObj["stageDebug"]; - if (argElt.eoo() || !argElt.isABSONObj()) { return false; } - BSONObj argObj = argElt.Obj(); - - // Pull out the collection name. 
- BSONElement collElt = argObj["collection"]; - if (collElt.eoo() || (String != collElt.type())) { - return false; - } - string collName = collElt.String(); - - // Need a context to get the actual Collection* - // TODO A write lock is currently taken here to accommodate stages that perform writes - // (e.g. DeleteStage). This should be changed to use a read lock for read-only - // execution trees. - ScopedTransaction transaction(txn, MODE_IX); - Lock::DBLock lk(txn->lockState(), dbname, MODE_X); - OldClientContext ctx(txn, dbname); - - // Make sure the collection is valid. - Database* db = ctx.db(); - Collection* collection = db->getCollection(db->name() + '.' + collName); - uassert(17446, "Couldn't find the collection " + collName, NULL != collection); - - // Pull out the plan - BSONElement planElt = argObj["plan"]; - if (planElt.eoo() || !planElt.isABSONObj()) { - return false; - } - BSONObj planObj = planElt.Obj(); +/** + * A command for manually constructing a query tree and running it. + * + * db.runCommand({stageDebug: {collection: collname, plan: rootNode}}) + * + * The value of the filter field is a BSONObj that specifies values that fields must have. What + * you'd pass to a matcher. + * + * Leaf Nodes: + * + * node -> {ixscan: {filter: {FILTER}, + * args: {indexKeyPattern: kpObj, start: startObj, + * stop: stopObj, endInclusive: true/false, direction: -1/1, + * limit: int}}} + * node -> {cscan: {filter: {filter}, args: {direction: -1/1}}} + * TODO: language for text. 
+ * node -> {text: {filter: {filter}, args: {search: "searchstr"}}} + * + * Internal Nodes: + * + * node -> {andHash: {args: { nodes: [node, node]}}} + * node -> {andSorted: {args: { nodes: [node, node]}}} + * node -> {or: {filter: {filter}, args: { dedup:bool, nodes:[node, node]}}} + * node -> {fetch: {filter: {filter}, args: {node: node}}} + * node -> {limit: {args: {node: node, num: posint}}} + * node -> {skip: {args: {node: node, num: posint}}} + * node -> {sort: {args: {node: node, pattern: objWithSortCriterion }}} + * node -> {mergeSort: {args: {nodes: [node, node], pattern: objWithSortCriterion}}} + * node -> {delete: {args: {node: node, isMulti: bool, shouldCallLogOp: bool}}} + * + * Forthcoming Nodes: + * + * node -> {dedup: {filter: {filter}, args: {node: node, field: field}}} + * node -> {unwind: {filter: filter}, args: {node: node, field: field}} + */ +class StageDebugCmd : public Command { +public: + StageDebugCmd() : Command("stageDebug") {} + + virtual bool isWriteCommandForConfigServer() const { + return false; + } + bool slaveOk() const { + return false; + } + bool slaveOverrideOk() const { + return false; + } + void help(std::stringstream& h) const {} + + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + // Command is testing-only, and can only be enabled at command line. Hence, no auth + // check needed. + } + + bool run(OperationContext* txn, + const string& dbname, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result) { + BSONElement argElt = cmdObj["stageDebug"]; + if (argElt.eoo() || !argElt.isABSONObj()) { + return false; + } + BSONObj argObj = argElt.Obj(); - // Parse the plan into these. - OwnedPointerVector<MatchExpression> exprs; - unique_ptr<WorkingSet> ws(new WorkingSet()); + // Pull out the collection name. 
+ BSONElement collElt = argObj["collection"]; + if (collElt.eoo() || (String != collElt.type())) { + return false; + } + string collName = collElt.String(); + + // Need a context to get the actual Collection* + // TODO A write lock is currently taken here to accommodate stages that perform writes + // (e.g. DeleteStage). This should be changed to use a read lock for read-only + // execution trees. + ScopedTransaction transaction(txn, MODE_IX); + Lock::DBLock lk(txn->lockState(), dbname, MODE_X); + OldClientContext ctx(txn, dbname); + + // Make sure the collection is valid. + Database* db = ctx.db(); + Collection* collection = db->getCollection(db->name() + '.' + collName); + uassert(17446, "Couldn't find the collection " + collName, NULL != collection); + + // Pull out the plan + BSONElement planElt = argObj["plan"]; + if (planElt.eoo() || !planElt.isABSONObj()) { + return false; + } + BSONObj planObj = planElt.Obj(); - PlanStage* userRoot = parseQuery(txn, collection, planObj, ws.get(), &exprs); - uassert(16911, "Couldn't parse plan from " + cmdObj.toString(), NULL != userRoot); + // Parse the plan into these. + OwnedPointerVector<MatchExpression> exprs; + unique_ptr<WorkingSet> ws(new WorkingSet()); - // Add a fetch at the top for the user so we can get obj back for sure. - // TODO: Do we want to do this for the user? I think so. - PlanStage* rootFetch = new FetchStage(txn, ws.get(), userRoot, NULL, collection); + PlanStage* userRoot = parseQuery(txn, collection, planObj, ws.get(), &exprs); + uassert(16911, "Couldn't parse plan from " + cmdObj.toString(), NULL != userRoot); - PlanExecutor* rawExec; - Status execStatus = PlanExecutor::make(txn, ws.release(), rootFetch, collection, - PlanExecutor::YIELD_AUTO, &rawExec); - fassert(28536, execStatus); - std::unique_ptr<PlanExecutor> exec(rawExec); + // Add a fetch at the top for the user so we can get obj back for sure. + // TODO: Do we want to do this for the user? I think so. 
+ PlanStage* rootFetch = new FetchStage(txn, ws.get(), userRoot, NULL, collection); - BSONArrayBuilder resultBuilder(result.subarrayStart("results")); + PlanExecutor* rawExec; + Status execStatus = PlanExecutor::make( + txn, ws.release(), rootFetch, collection, PlanExecutor::YIELD_AUTO, &rawExec); + fassert(28536, execStatus); + std::unique_ptr<PlanExecutor> exec(rawExec); - BSONObj obj; - PlanExecutor::ExecState state; - while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { - resultBuilder.append(obj); - } + BSONArrayBuilder resultBuilder(result.subarrayStart("results")); - resultBuilder.done(); + BSONObj obj; + PlanExecutor::ExecState state; + while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { + resultBuilder.append(obj); + } - if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { - const std::unique_ptr<PlanStageStats> stats(exec->getStats()); - error() << "Plan executor error during StageDebug command: " - << PlanExecutor::statestr(state) - << ", stats: " << Explain::statsToBSON(*stats); + resultBuilder.done(); - return appendCommandStatus(result, - Status(ErrorCodes::OperationFailed, - str::stream() - << "Executor error during " - << "StageDebug command: " - << WorkingSetCommon::toStatusString(obj))); - } + if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { + const std::unique_ptr<PlanStageStats> stats(exec->getStats()); + error() << "Plan executor error during StageDebug command: " + << PlanExecutor::statestr(state) << ", stats: " << Explain::statsToBSON(*stats); - return true; + return appendCommandStatus( + result, + Status(ErrorCodes::OperationFailed, + str::stream() + << "Executor error during " + << "StageDebug command: " << WorkingSetCommon::toStatusString(obj))); } - PlanStage* parseQuery(OperationContext* txn, - Collection* collection, - BSONObj obj, - WorkingSet* workingSet, - OwnedPointerVector<MatchExpression>* exprs) { + return true; + } - BSONElement firstElt = 
obj.firstElement(); - if (!firstElt.isABSONObj()) { return NULL; } - BSONObj paramObj = firstElt.Obj(); + PlanStage* parseQuery(OperationContext* txn, + Collection* collection, + BSONObj obj, + WorkingSet* workingSet, + OwnedPointerVector<MatchExpression>* exprs) { + BSONElement firstElt = obj.firstElement(); + if (!firstElt.isABSONObj()) { + return NULL; + } + BSONObj paramObj = firstElt.Obj(); - MatchExpression* matcher = NULL; - BSONObj nodeArgs; + MatchExpression* matcher = NULL; + BSONObj nodeArgs; - // Every node has these two fields. - const string filterTag = "filter"; - const string argsTag = "args"; + // Every node has these two fields. + const string filterTag = "filter"; + const string argsTag = "args"; - BSONObjIterator it(paramObj); - while (it.more()) { - BSONElement e = it.next(); - if (!e.isABSONObj()) { return NULL; } - BSONObj argObj = e.Obj(); - if (filterTag == e.fieldName()) { - StatusWithMatchExpression swme = MatchExpressionParser::parse( - argObj, WhereCallbackReal(txn, collection->ns().db())); - if (!swme.isOK()) { return NULL; } - // exprs is what will wind up deleting this. - matcher = swme.getValue(); - verify(NULL != matcher); - exprs->mutableVector().push_back(matcher); - } - else if (argsTag == e.fieldName()) { - nodeArgs = argObj; - } - else { - uasserted(16910, "Unknown fieldname " + string(e.fieldName()) - + " in query node " + obj.toString()); + BSONObjIterator it(paramObj); + while (it.more()) { + BSONElement e = it.next(); + if (!e.isABSONObj()) { + return NULL; + } + BSONObj argObj = e.Obj(); + if (filterTag == e.fieldName()) { + StatusWithMatchExpression swme = MatchExpressionParser::parse( + argObj, WhereCallbackReal(txn, collection->ns().db())); + if (!swme.isOK()) { return NULL; } + // exprs is what will wind up deleting this. 
+ matcher = swme.getValue(); + verify(NULL != matcher); + exprs->mutableVector().push_back(matcher); + } else if (argsTag == e.fieldName()) { + nodeArgs = argObj; + } else { + uasserted(16910, + "Unknown fieldname " + string(e.fieldName()) + " in query node " + + obj.toString()); + return NULL; } + } - string nodeName = firstElt.fieldName(); + string nodeName = firstElt.fieldName(); - if ("ixscan" == nodeName) { - // This'll throw if it's not an obj but that's OK. - BSONObj keyPatternObj = nodeArgs["keyPattern"].Obj(); + if ("ixscan" == nodeName) { + // This'll throw if it's not an obj but that's OK. + BSONObj keyPatternObj = nodeArgs["keyPattern"].Obj(); - IndexDescriptor* desc = - collection->getIndexCatalog()->findIndexByKeyPattern(txn, keyPatternObj); - uassert(16890, "Can't find index: " + keyPatternObj.toString(), desc); + IndexDescriptor* desc = + collection->getIndexCatalog()->findIndexByKeyPattern(txn, keyPatternObj); + uassert(16890, "Can't find index: " + keyPatternObj.toString(), desc); - IndexScanParams params; - params.descriptor = desc; - params.bounds.isSimpleRange = true; - params.bounds.startKey = nodeArgs["startKey"].Obj(); - params.bounds.endKey = nodeArgs["endKey"].Obj(); - params.bounds.endKeyInclusive = nodeArgs["endKeyInclusive"].Bool(); - params.direction = nodeArgs["direction"].numberInt(); + IndexScanParams params; + params.descriptor = desc; + params.bounds.isSimpleRange = true; + params.bounds.startKey = nodeArgs["startKey"].Obj(); + params.bounds.endKey = nodeArgs["endKey"].Obj(); + params.bounds.endKeyInclusive = nodeArgs["endKeyInclusive"].Bool(); + params.direction = nodeArgs["direction"].numberInt(); - return new IndexScan(txn, params, workingSet, matcher); - } - else if ("andHash" == nodeName) { - uassert(16921, "Nodes argument must be provided to AND", - nodeArgs["nodes"].isABSONObj()); - - unique_ptr<AndHashStage> andStage(new AndHashStage(workingSet, collection)); - - int nodesAdded = 0; - BSONObjIterator 
it(nodeArgs["nodes"].Obj()); - while (it.more()) { - BSONElement e = it.next(); - uassert(16922, "node of AND isn't an obj?: " + e.toString(), - e.isABSONObj()); - - PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs); - uassert(16923, "Can't parse sub-node of AND: " + e.Obj().toString(), - NULL != subNode); - // takes ownership - andStage->addChild(subNode); - ++nodesAdded; - } + return new IndexScan(txn, params, workingSet, matcher); + } else if ("andHash" == nodeName) { + uassert( + 16921, "Nodes argument must be provided to AND", nodeArgs["nodes"].isABSONObj()); - uassert(16927, "AND requires more than one child", nodesAdded >= 2); + unique_ptr<AndHashStage> andStage(new AndHashStage(workingSet, collection)); - return andStage.release(); + int nodesAdded = 0; + BSONObjIterator it(nodeArgs["nodes"].Obj()); + while (it.more()) { + BSONElement e = it.next(); + uassert(16922, "node of AND isn't an obj?: " + e.toString(), e.isABSONObj()); + + PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs); + uassert( + 16923, "Can't parse sub-node of AND: " + e.Obj().toString(), NULL != subNode); + // takes ownership + andStage->addChild(subNode); + ++nodesAdded; } - else if ("andSorted" == nodeName) { - uassert(16924, "Nodes argument must be provided to AND", - nodeArgs["nodes"].isABSONObj()); - - unique_ptr<AndSortedStage> andStage(new AndSortedStage(workingSet, collection)); - - int nodesAdded = 0; - BSONObjIterator it(nodeArgs["nodes"].Obj()); - while (it.more()) { - BSONElement e = it.next(); - uassert(16925, "node of AND isn't an obj?: " + e.toString(), - e.isABSONObj()); - - PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs); - uassert(16926, "Can't parse sub-node of AND: " + e.Obj().toString(), - NULL != subNode); - // takes ownership - andStage->addChild(subNode); - ++nodesAdded; - } - uassert(16928, "AND requires more than one child", nodesAdded >= 2); + uassert(16927, "AND requires more than 
one child", nodesAdded >= 2); - return andStage.release(); - } - else if ("or" == nodeName) { - uassert(16934, "Nodes argument must be provided to AND", - nodeArgs["nodes"].isABSONObj()); - uassert(16935, "Dedup argument must be provided to OR", - !nodeArgs["dedup"].eoo()); - BSONObjIterator it(nodeArgs["nodes"].Obj()); - unique_ptr<OrStage> orStage(new OrStage(workingSet, nodeArgs["dedup"].Bool(), - matcher)); - while (it.more()) { - BSONElement e = it.next(); - if (!e.isABSONObj()) { return NULL; } - PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs); - uassert(16936, "Can't parse sub-node of OR: " + e.Obj().toString(), - NULL != subNode); - // takes ownership - orStage->addChild(subNode); - } + return andStage.release(); + } else if ("andSorted" == nodeName) { + uassert( + 16924, "Nodes argument must be provided to AND", nodeArgs["nodes"].isABSONObj()); - return orStage.release(); - } - else if ("fetch" == nodeName) { - uassert(16929, "Node argument must be provided to fetch", - nodeArgs["node"].isABSONObj()); - PlanStage* subNode = parseQuery(txn, - collection, - nodeArgs["node"].Obj(), - workingSet, - exprs); - return new FetchStage(txn, workingSet, subNode, matcher, collection); - } - else if ("limit" == nodeName) { - uassert(16937, "Limit stage doesn't have a filter (put it on the child)", - NULL == matcher); - uassert(16930, "Node argument must be provided to limit", - nodeArgs["node"].isABSONObj()); - uassert(16931, "Num argument must be provided to limit", - nodeArgs["num"].isNumber()); - PlanStage* subNode = parseQuery(txn, - collection, - nodeArgs["node"].Obj(), - workingSet, - exprs); - return new LimitStage(nodeArgs["num"].numberInt(), workingSet, subNode); - } - else if ("skip" == nodeName) { - uassert(16938, "Skip stage doesn't have a filter (put it on the child)", - NULL == matcher); - uassert(16932, "Node argument must be provided to skip", - nodeArgs["node"].isABSONObj()); - uassert(16933, "Num argument must be provided 
to skip", - nodeArgs["num"].isNumber()); - PlanStage* subNode = parseQuery(txn, - collection, - nodeArgs["node"].Obj(), - workingSet, - exprs); - return new SkipStage(nodeArgs["num"].numberInt(), workingSet, subNode); + unique_ptr<AndSortedStage> andStage(new AndSortedStage(workingSet, collection)); + + int nodesAdded = 0; + BSONObjIterator it(nodeArgs["nodes"].Obj()); + while (it.more()) { + BSONElement e = it.next(); + uassert(16925, "node of AND isn't an obj?: " + e.toString(), e.isABSONObj()); + + PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs); + uassert( + 16926, "Can't parse sub-node of AND: " + e.Obj().toString(), NULL != subNode); + // takes ownership + andStage->addChild(subNode); + ++nodesAdded; } - else if ("cscan" == nodeName) { - CollectionScanParams params; - params.collection = collection; - - // What direction? - uassert(16963, "Direction argument must be specified and be a number", - nodeArgs["direction"].isNumber()); - if (1 == nodeArgs["direction"].numberInt()) { - params.direction = CollectionScanParams::FORWARD; - } - else { - params.direction = CollectionScanParams::BACKWARD; + + uassert(16928, "AND requires more than one child", nodesAdded >= 2); + + return andStage.release(); + } else if ("or" == nodeName) { + uassert( + 16934, "Nodes argument must be provided to AND", nodeArgs["nodes"].isABSONObj()); + uassert(16935, "Dedup argument must be provided to OR", !nodeArgs["dedup"].eoo()); + BSONObjIterator it(nodeArgs["nodes"].Obj()); + unique_ptr<OrStage> orStage(new OrStage(workingSet, nodeArgs["dedup"].Bool(), matcher)); + while (it.more()) { + BSONElement e = it.next(); + if (!e.isABSONObj()) { + return NULL; } + PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs); + uassert( + 16936, "Can't parse sub-node of OR: " + e.Obj().toString(), NULL != subNode); + // takes ownership + orStage->addChild(subNode); + } - return new CollectionScan(txn, params, workingSet, matcher); + return 
orStage.release(); + } else if ("fetch" == nodeName) { + uassert( + 16929, "Node argument must be provided to fetch", nodeArgs["node"].isABSONObj()); + PlanStage* subNode = + parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs); + return new FetchStage(txn, workingSet, subNode, matcher, collection); + } else if ("limit" == nodeName) { + uassert( + 16937, "Limit stage doesn't have a filter (put it on the child)", NULL == matcher); + uassert( + 16930, "Node argument must be provided to limit", nodeArgs["node"].isABSONObj()); + uassert(16931, "Num argument must be provided to limit", nodeArgs["num"].isNumber()); + PlanStage* subNode = + parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs); + return new LimitStage(nodeArgs["num"].numberInt(), workingSet, subNode); + } else if ("skip" == nodeName) { + uassert( + 16938, "Skip stage doesn't have a filter (put it on the child)", NULL == matcher); + uassert(16932, "Node argument must be provided to skip", nodeArgs["node"].isABSONObj()); + uassert(16933, "Num argument must be provided to skip", nodeArgs["num"].isNumber()); + PlanStage* subNode = + parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs); + return new SkipStage(nodeArgs["num"].numberInt(), workingSet, subNode); + } else if ("cscan" == nodeName) { + CollectionScanParams params; + params.collection = collection; + + // What direction? + uassert(16963, + "Direction argument must be specified and be a number", + nodeArgs["direction"].isNumber()); + if (1 == nodeArgs["direction"].numberInt()) { + params.direction = CollectionScanParams::FORWARD; + } else { + params.direction = CollectionScanParams::BACKWARD; } - // sort is disabled for now. + + return new CollectionScan(txn, params, workingSet, matcher); + } +// sort is disabled for now. 
#if 0 else if ("sort" == nodeName) { uassert(16969, "Node argument must be provided to sort", @@ -393,96 +383,93 @@ namespace mongo { return new SortStage(params, workingSet, subNode); } #endif - else if ("mergeSort" == nodeName) { - uassert(16971, "Nodes argument must be provided to sort", - nodeArgs["nodes"].isABSONObj()); - uassert(16972, "Pattern argument must be provided to sort", - nodeArgs["pattern"].isABSONObj()); - - MergeSortStageParams params; - params.pattern = nodeArgs["pattern"].Obj(); - // Dedup is true by default. - - unique_ptr<MergeSortStage> mergeStage(new MergeSortStage(params, workingSet, - collection)); - - BSONObjIterator it(nodeArgs["nodes"].Obj()); - while (it.more()) { - BSONElement e = it.next(); - uassert(16973, "node of mergeSort isn't an obj?: " + e.toString(), - e.isABSONObj()); - - PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs); - uassert(16974, "Can't parse sub-node of mergeSort: " + e.Obj().toString(), - NULL != subNode); - // takes ownership - mergeStage->addChild(subNode); - } - return mergeStage.release(); - } - else if ("text" == nodeName) { - string search = nodeArgs["search"].String(); - - vector<IndexDescriptor*> idxMatches; - collection->getIndexCatalog()->findIndexByType(txn, "text", idxMatches); - uassert(17194, "Expected exactly one text index", idxMatches.size() == 1); - - IndexDescriptor* index = idxMatches[0]; - FTSAccessMethod* fam = - dynamic_cast<FTSAccessMethod*>( collection->getIndexCatalog()->getIndex( index ) ); - TextStageParams params(fam->getSpec()); - params.index = index; - - // TODO: Deal with non-empty filters. This is a hack to put in covering information - // that can only be checked for equality. We ignore this now. 
- Status s = fam->getSpec().getIndexPrefix(BSONObj(), ¶ms.indexPrefix); - if (!s.isOK()) { - // errmsg = s.toString(); - return NULL; - } + else if ("mergeSort" == nodeName) { + uassert( + 16971, "Nodes argument must be provided to sort", nodeArgs["nodes"].isABSONObj()); + uassert(16972, + "Pattern argument must be provided to sort", + nodeArgs["pattern"].isABSONObj()); - params.spec = fam->getSpec(); + MergeSortStageParams params; + params.pattern = nodeArgs["pattern"].Obj(); + // Dedup is true by default. - if (!params.query.parse(search, - fam->getSpec().defaultLanguage().str().c_str(), - fts::FTSQuery::caseSensitiveDefault, - fam->getSpec().getTextIndexVersion()).isOK()) { - return NULL; - } + unique_ptr<MergeSortStage> mergeStage( + new MergeSortStage(params, workingSet, collection)); - return new TextStage(txn, params, workingSet, matcher); + BSONObjIterator it(nodeArgs["nodes"].Obj()); + while (it.more()) { + BSONElement e = it.next(); + uassert(16973, "node of mergeSort isn't an obj?: " + e.toString(), e.isABSONObj()); + + PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs); + uassert(16974, + "Can't parse sub-node of mergeSort: " + e.Obj().toString(), + NULL != subNode); + // takes ownership + mergeStage->addChild(subNode); } - else if ("delete" == nodeName) { - uassert(18636, "Delete stage doesn't have a filter (put it on the child)", - NULL == matcher); - uassert(18637, "node argument must be provided to delete", - nodeArgs["node"].isABSONObj()); - uassert(18638, "isMulti argument must be provided to delete", - nodeArgs["isMulti"].type() == Bool); - uassert(18639, "shouldCallLogOp argument must be provided to delete", - nodeArgs["shouldCallLogOp"].type() == Bool); - PlanStage* subNode = parseQuery(txn, - collection, - nodeArgs["node"].Obj(), - workingSet, - exprs); - DeleteStageParams params; - params.isMulti = nodeArgs["isMulti"].Bool(); - params.shouldCallLogOp = nodeArgs["shouldCallLogOp"].Bool(); - return new DeleteStage(txn, 
params, workingSet, collection, subNode); + return mergeStage.release(); + } else if ("text" == nodeName) { + string search = nodeArgs["search"].String(); + + vector<IndexDescriptor*> idxMatches; + collection->getIndexCatalog()->findIndexByType(txn, "text", idxMatches); + uassert(17194, "Expected exactly one text index", idxMatches.size() == 1); + + IndexDescriptor* index = idxMatches[0]; + FTSAccessMethod* fam = + dynamic_cast<FTSAccessMethod*>(collection->getIndexCatalog()->getIndex(index)); + TextStageParams params(fam->getSpec()); + params.index = index; + + // TODO: Deal with non-empty filters. This is a hack to put in covering information + // that can only be checked for equality. We ignore this now. + Status s = fam->getSpec().getIndexPrefix(BSONObj(), ¶ms.indexPrefix); + if (!s.isOK()) { + // errmsg = s.toString(); + return NULL; } - else { + + params.spec = fam->getSpec(); + + if (!params.query.parse(search, + fam->getSpec().defaultLanguage().str().c_str(), + fts::FTSQuery::caseSensitiveDefault, + fam->getSpec().getTextIndexVersion()).isOK()) { return NULL; } - } - }; - MONGO_INITIALIZER(RegisterStageDebugCmd)(InitializerContext* context) { - if (Command::testCommandsEnabled) { - // Leaked intentionally: a Command registers itself when constructed. 
- new StageDebugCmd(); + return new TextStage(txn, params, workingSet, matcher); + } else if ("delete" == nodeName) { + uassert( + 18636, "Delete stage doesn't have a filter (put it on the child)", NULL == matcher); + uassert( + 18637, "node argument must be provided to delete", nodeArgs["node"].isABSONObj()); + uassert(18638, + "isMulti argument must be provided to delete", + nodeArgs["isMulti"].type() == Bool); + uassert(18639, + "shouldCallLogOp argument must be provided to delete", + nodeArgs["shouldCallLogOp"].type() == Bool); + PlanStage* subNode = + parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs); + DeleteStageParams params; + params.isMulti = nodeArgs["isMulti"].Bool(); + params.shouldCallLogOp = nodeArgs["shouldCallLogOp"].Bool(); + return new DeleteStage(txn, params, workingSet, collection, subNode); + } else { + return NULL; } - return Status::OK(); } +}; + +MONGO_INITIALIZER(RegisterStageDebugCmd)(InitializerContext* context) { + if (Command::testCommandsEnabled) { + // Leaked intentionally: a Command registers itself when constructed. 
+ new StageDebugCmd(); + } + return Status::OK(); +} } // namespace mongo diff --git a/src/mongo/db/exec/subplan.cpp b/src/mongo/db/exec/subplan.cpp index 0c64aa7b178..62daed09f32 100644 --- a/src/mongo/db/exec/subplan.cpp +++ b/src/mongo/db/exec/subplan.cpp @@ -45,513 +45,487 @@ namespace mongo { - using std::unique_ptr; - using std::endl; - using std::vector; - - // static - const char* SubplanStage::kStageType = "SUBPLAN"; - - SubplanStage::SubplanStage(OperationContext* txn, - Collection* collection, - WorkingSet* ws, - const QueryPlannerParams& params, - CanonicalQuery* cq) - : _txn(txn), - _collection(collection), - _ws(ws), - _plannerParams(params), - _query(cq), - _child(nullptr), - _commonStats(kStageType) { - invariant(_collection); +using std::unique_ptr; +using std::endl; +using std::vector; + +// static +const char* SubplanStage::kStageType = "SUBPLAN"; + +SubplanStage::SubplanStage(OperationContext* txn, + Collection* collection, + WorkingSet* ws, + const QueryPlannerParams& params, + CanonicalQuery* cq) + : _txn(txn), + _collection(collection), + _ws(ws), + _plannerParams(params), + _query(cq), + _child(nullptr), + _commonStats(kStageType) { + invariant(_collection); +} + +// static +bool SubplanStage::canUseSubplanning(const CanonicalQuery& query) { + const LiteParsedQuery& lpq = query.getParsed(); + const MatchExpression* expr = query.root(); + + // Only rooted ORs work with the subplan scheme. + if (MatchExpression::OR != expr->matchType()) { + return false; } - // static - bool SubplanStage::canUseSubplanning(const CanonicalQuery& query) { - const LiteParsedQuery& lpq = query.getParsed(); - const MatchExpression* expr = query.root(); + // Hint provided + if (!lpq.getHint().isEmpty()) { + return false; + } - // Only rooted ORs work with the subplan scheme. - if (MatchExpression::OR != expr->matchType()) { - return false; - } + // Min provided + // Min queries are a special case of hinted queries. 
+ if (!lpq.getMin().isEmpty()) { + return false; + } - // Hint provided - if (!lpq.getHint().isEmpty()) { - return false; - } + // Max provided + // Similar to min, max queries are a special case of hinted queries. + if (!lpq.getMax().isEmpty()) { + return false; + } - // Min provided - // Min queries are a special case of hinted queries. - if (!lpq.getMin().isEmpty()) { - return false; - } + // Tailable cursors won't get cached, just turn into collscans. + if (query.getParsed().isTailable()) { + return false; + } - // Max provided - // Similar to min, max queries are a special case of hinted queries. - if (!lpq.getMax().isEmpty()) { - return false; - } + // Snapshot is really a hint. + if (query.getParsed().isSnapshot()) { + return false; + } - // Tailable cursors won't get cached, just turn into collscans. - if (query.getParsed().isTailable()) { - return false; - } + return true; +} - // Snapshot is really a hint. - if (query.getParsed().isSnapshot()) { - return false; - } +Status SubplanStage::planSubqueries() { + // Adds the amount of time taken by planSubqueries() to executionTimeMillis. There's lots of + // work that happens here, so this is needed for the time accounting to make sense. + ScopedTimer timer(&_commonStats.executionTimeMillis); - return true; + MatchExpression* orExpr = _query->root(); + + for (size_t i = 0; i < _plannerParams.indices.size(); ++i) { + const IndexEntry& ie = _plannerParams.indices[i]; + _indexMap[ie.keyPattern] = i; + LOG(5) << "Subplanner: index " << i << " is " << ie.toString() << endl; } - Status SubplanStage::planSubqueries() { - // Adds the amount of time taken by planSubqueries() to executionTimeMillis. There's lots of - // work that happens here, so this is needed for the time accounting to make sense. 
- ScopedTimer timer(&_commonStats.executionTimeMillis); + const WhereCallbackReal whereCallback(_txn, _collection->ns().db()); - MatchExpression* orExpr = _query->root(); + for (size_t i = 0; i < orExpr->numChildren(); ++i) { + // We need a place to shove the results from planning this branch. + _branchResults.push_back(new BranchPlanningResult()); + BranchPlanningResult* branchResult = _branchResults.back(); - for (size_t i = 0; i < _plannerParams.indices.size(); ++i) { - const IndexEntry& ie = _plannerParams.indices[i]; - _indexMap[ie.keyPattern] = i; - LOG(5) << "Subplanner: index " << i << " is " << ie.toString() << endl; - } + MatchExpression* orChild = orExpr->getChild(i); - const WhereCallbackReal whereCallback(_txn, _collection->ns().db()); - - for (size_t i = 0; i < orExpr->numChildren(); ++i) { - // We need a place to shove the results from planning this branch. - _branchResults.push_back(new BranchPlanningResult()); - BranchPlanningResult* branchResult = _branchResults.back(); - - MatchExpression* orChild = orExpr->getChild(i); - - // Turn the i-th child into its own query. - { - CanonicalQuery* orChildCQ; - Status childCQStatus = CanonicalQuery::canonicalize(*_query, - orChild, - &orChildCQ, - whereCallback); - if (!childCQStatus.isOK()) { - mongoutils::str::stream ss; - ss << "Can't canonicalize subchild " << orChild->toString() - << " " << childCQStatus.reason(); - return Status(ErrorCodes::BadValue, ss); - } - - branchResult->canonicalQuery.reset(orChildCQ); + // Turn the i-th child into its own query. + { + CanonicalQuery* orChildCQ; + Status childCQStatus = + CanonicalQuery::canonicalize(*_query, orChild, &orChildCQ, whereCallback); + if (!childCQStatus.isOK()) { + mongoutils::str::stream ss; + ss << "Can't canonicalize subchild " << orChild->toString() << " " + << childCQStatus.reason(); + return Status(ErrorCodes::BadValue, ss); } - // Plan the i-th child. We might be able to find a plan for the i-th child in the plan - // cache. 
If there's no cached plan, then we generate and rank plans using the MPS. - CachedSolution* rawCS; - if (PlanCache::shouldCacheQuery(*branchResult->canonicalQuery.get()) && - _collection->infoCache()->getPlanCache()->get(*branchResult->canonicalQuery.get(), - &rawCS).isOK()) { - // We have a CachedSolution. Store it for later. - LOG(5) << "Subplanner: cached plan found for child " << i << " of " - << orExpr->numChildren(); - - branchResult->cachedSolution.reset(rawCS); - } - else { - // No CachedSolution found. We'll have to plan from scratch. - LOG(5) << "Subplanner: planning child " << i << " of " << orExpr->numChildren(); - - // We don't set NO_TABLE_SCAN because peeking at the cache data will keep us from - // considering any plan that's a collscan. - Status status = QueryPlanner::plan(*branchResult->canonicalQuery.get(), - _plannerParams, - &branchResult->solutions.mutableVector()); - - if (!status.isOK()) { - mongoutils::str::stream ss; - ss << "Can't plan for subchild " - << branchResult->canonicalQuery->toString() - << " " << status.reason(); - return Status(ErrorCodes::BadValue, ss); - } - LOG(5) << "Subplanner: got " << branchResult->solutions.size() << " solutions"; - - if (0 == branchResult->solutions.size()) { - // If one child doesn't have an indexed solution, bail out. - mongoutils::str::stream ss; - ss << "No solutions for subchild " << branchResult->canonicalQuery->toString(); - return Status(ErrorCodes::BadValue, ss); - } - } + branchResult->canonicalQuery.reset(orChildCQ); } - return Status::OK(); - } - - namespace { - - /** - * On success, applies the index tags from 'branchCacheData' (which represent the winning - * plan for 'orChild') to 'compositeCacheData'. - */ - Status tagOrChildAccordingToCache(PlanCacheIndexTree* compositeCacheData, - SolutionCacheData* branchCacheData, - MatchExpression* orChild, - const std::map<BSONObj, size_t>& indexMap) { - invariant(compositeCacheData); - - // We want a well-formed *indexed* solution. 
- if (NULL == branchCacheData) { - // For example, we don't cache things for 2d indices. + // Plan the i-th child. We might be able to find a plan for the i-th child in the plan + // cache. If there's no cached plan, then we generate and rank plans using the MPS. + CachedSolution* rawCS; + if (PlanCache::shouldCacheQuery(*branchResult->canonicalQuery.get()) && + _collection->infoCache() + ->getPlanCache() + ->get(*branchResult->canonicalQuery.get(), &rawCS) + .isOK()) { + // We have a CachedSolution. Store it for later. + LOG(5) << "Subplanner: cached plan found for child " << i << " of " + << orExpr->numChildren(); + + branchResult->cachedSolution.reset(rawCS); + } else { + // No CachedSolution found. We'll have to plan from scratch. + LOG(5) << "Subplanner: planning child " << i << " of " << orExpr->numChildren(); + + // We don't set NO_TABLE_SCAN because peeking at the cache data will keep us from + // considering any plan that's a collscan. + Status status = QueryPlanner::plan(*branchResult->canonicalQuery.get(), + _plannerParams, + &branchResult->solutions.mutableVector()); + + if (!status.isOK()) { mongoutils::str::stream ss; - ss << "No cache data for subchild " << orChild->toString(); + ss << "Can't plan for subchild " << branchResult->canonicalQuery->toString() << " " + << status.reason(); return Status(ErrorCodes::BadValue, ss); } + LOG(5) << "Subplanner: got " << branchResult->solutions.size() << " solutions"; - if (SolutionCacheData::USE_INDEX_TAGS_SOLN != branchCacheData->solnType) { + if (0 == branchResult->solutions.size()) { + // If one child doesn't have an indexed solution, bail out. mongoutils::str::stream ss; - ss << "No indexed cache data for subchild " - << orChild->toString(); + ss << "No solutions for subchild " << branchResult->canonicalQuery->toString(); return Status(ErrorCodes::BadValue, ss); } + } + } - // Add the index assignments to our original query. 
- Status tagStatus = QueryPlanner::tagAccordingToCache(orChild, - branchCacheData->tree.get(), - indexMap); + return Status::OK(); +} + +namespace { +/** + * On success, applies the index tags from 'branchCacheData' (which represent the winning + * plan for 'orChild') to 'compositeCacheData'. + */ +Status tagOrChildAccordingToCache(PlanCacheIndexTree* compositeCacheData, + SolutionCacheData* branchCacheData, + MatchExpression* orChild, + const std::map<BSONObj, size_t>& indexMap) { + invariant(compositeCacheData); + + // We want a well-formed *indexed* solution. + if (NULL == branchCacheData) { + // For example, we don't cache things for 2d indices. + mongoutils::str::stream ss; + ss << "No cache data for subchild " << orChild->toString(); + return Status(ErrorCodes::BadValue, ss); + } + + if (SolutionCacheData::USE_INDEX_TAGS_SOLN != branchCacheData->solnType) { + mongoutils::str::stream ss; + ss << "No indexed cache data for subchild " << orChild->toString(); + return Status(ErrorCodes::BadValue, ss); + } + + // Add the index assignments to our original query. + Status tagStatus = + QueryPlanner::tagAccordingToCache(orChild, branchCacheData->tree.get(), indexMap); + + if (!tagStatus.isOK()) { + mongoutils::str::stream ss; + ss << "Failed to extract indices from subchild " << orChild->toString(); + return Status(ErrorCodes::BadValue, ss); + } + + // Add the child's cache data to the cache data we're creating for the main query. + compositeCacheData->children.push_back(branchCacheData->tree->clone()); + + return Status::OK(); +} + +} // namespace + +Status SubplanStage::choosePlanForSubqueries(PlanYieldPolicy* yieldPolicy) { + // This is what we annotate with the index selections and then turn into a solution. + unique_ptr<OrMatchExpression> orExpr( + static_cast<OrMatchExpression*>(_query->root()->shallowClone())); + + // This is the skeleton of index selections that is inserted into the cache. 
+ unique_ptr<PlanCacheIndexTree> cacheData(new PlanCacheIndexTree()); + + for (size_t i = 0; i < orExpr->numChildren(); ++i) { + MatchExpression* orChild = orExpr->getChild(i); + BranchPlanningResult* branchResult = _branchResults[i]; + + if (branchResult->cachedSolution.get()) { + // We can get the index tags we need out of the cache. + Status tagStatus = tagOrChildAccordingToCache( + cacheData.get(), branchResult->cachedSolution->plannerData[0], orChild, _indexMap); if (!tagStatus.isOK()) { - mongoutils::str::stream ss; - ss << "Failed to extract indices from subchild " - << orChild->toString(); - return Status(ErrorCodes::BadValue, ss); + return tagStatus; } + } else if (1 == branchResult->solutions.size()) { + QuerySolution* soln = branchResult->solutions.front(); + Status tagStatus = tagOrChildAccordingToCache( + cacheData.get(), soln->cacheData.get(), orChild, _indexMap); + if (!tagStatus.isOK()) { + return tagStatus; + } + } else { + // N solutions, rank them. - // Add the child's cache data to the cache data we're creating for the main query. - compositeCacheData->children.push_back(branchCacheData->tree->clone()); + // We already checked for zero solutions in planSubqueries(...). + invariant(!branchResult->solutions.empty()); - return Status::OK(); - } + _ws->clear(); + + _child.reset(new MultiPlanStage(_txn, _collection, branchResult->canonicalQuery.get())); + MultiPlanStage* multiPlanStage = static_cast<MultiPlanStage*>(_child.get()); + + // Dump all the solutions into the MPS. + for (size_t ix = 0; ix < branchResult->solutions.size(); ++ix) { + PlanStage* nextPlanRoot; + invariant(StageBuilder::build( + _txn, _collection, *branchResult->solutions[ix], _ws, &nextPlanRoot)); - } // namespace - - Status SubplanStage::choosePlanForSubqueries(PlanYieldPolicy* yieldPolicy) { - // This is what we annotate with the index selections and then turn into a solution. 
- unique_ptr<OrMatchExpression> orExpr( - static_cast<OrMatchExpression*>(_query->root()->shallowClone())); - - // This is the skeleton of index selections that is inserted into the cache. - unique_ptr<PlanCacheIndexTree> cacheData(new PlanCacheIndexTree()); - - for (size_t i = 0; i < orExpr->numChildren(); ++i) { - MatchExpression* orChild = orExpr->getChild(i); - BranchPlanningResult* branchResult = _branchResults[i]; - - if (branchResult->cachedSolution.get()) { - // We can get the index tags we need out of the cache. - Status tagStatus = tagOrChildAccordingToCache( - cacheData.get(), - branchResult->cachedSolution->plannerData[0], - orChild, - _indexMap); - if (!tagStatus.isOK()) { - return tagStatus; - } + // Takes ownership of solution with index 'ix' and 'nextPlanRoot'. + multiPlanStage->addPlan(branchResult->solutions.releaseAt(ix), nextPlanRoot, _ws); } - else if (1 == branchResult->solutions.size()) { - QuerySolution* soln = branchResult->solutions.front(); - Status tagStatus = tagOrChildAccordingToCache(cacheData.get(), - soln->cacheData.get(), - orChild, - _indexMap); - if (!tagStatus.isOK()) { - return tagStatus; - } + + Status planSelectStat = multiPlanStage->pickBestPlan(yieldPolicy); + if (!planSelectStat.isOK()) { + return planSelectStat; } - else { - // N solutions, rank them. - - // We already checked for zero solutions in planSubqueries(...). - invariant(!branchResult->solutions.empty()); - - _ws->clear(); - - _child.reset(new MultiPlanStage(_txn, _collection, - branchResult->canonicalQuery.get())); - MultiPlanStage* multiPlanStage = static_cast<MultiPlanStage*>(_child.get()); - - // Dump all the solutions into the MPS. - for (size_t ix = 0; ix < branchResult->solutions.size(); ++ix) { - PlanStage* nextPlanRoot; - invariant(StageBuilder::build(_txn, - _collection, - *branchResult->solutions[ix], - _ws, - &nextPlanRoot)); - - // Takes ownership of solution with index 'ix' and 'nextPlanRoot'. 
- multiPlanStage->addPlan(branchResult->solutions.releaseAt(ix), - nextPlanRoot, - _ws); - } - - Status planSelectStat = multiPlanStage->pickBestPlan(yieldPolicy); - if (!planSelectStat.isOK()) { - return planSelectStat; - } - - if (!multiPlanStage->bestPlanChosen()) { - mongoutils::str::stream ss; - ss << "Failed to pick best plan for subchild " - << branchResult->canonicalQuery->toString(); - return Status(ErrorCodes::BadValue, ss); - } - - QuerySolution* bestSoln = multiPlanStage->bestSolution(); - - // Check that we have good cache data. For example, we don't cache things - // for 2d indices. - if (NULL == bestSoln->cacheData.get()) { - mongoutils::str::stream ss; - ss << "No cache data for subchild " << orChild->toString(); - return Status(ErrorCodes::BadValue, ss); - } - - if (SolutionCacheData::USE_INDEX_TAGS_SOLN != bestSoln->cacheData->solnType) { - mongoutils::str::stream ss; - ss << "No indexed cache data for subchild " - << orChild->toString(); - return Status(ErrorCodes::BadValue, ss); - } - - // Add the index assignments to our original query. - Status tagStatus = QueryPlanner::tagAccordingToCache( - orChild, bestSoln->cacheData->tree.get(), _indexMap); - - if (!tagStatus.isOK()) { - mongoutils::str::stream ss; - ss << "Failed to extract indices from subchild " - << orChild->toString(); - return Status(ErrorCodes::BadValue, ss); - } - - cacheData->children.push_back(bestSoln->cacheData->tree->clone()); + + if (!multiPlanStage->bestPlanChosen()) { + mongoutils::str::stream ss; + ss << "Failed to pick best plan for subchild " + << branchResult->canonicalQuery->toString(); + return Status(ErrorCodes::BadValue, ss); } - } - // Must do this before using the planner functionality. - sortUsingTags(orExpr.get()); + QuerySolution* bestSoln = multiPlanStage->bestSolution(); - // Use the cached index assignments to build solnRoot. Takes ownership of 'orExpr'. 
- QuerySolutionNode* solnRoot = QueryPlannerAccess::buildIndexedDataAccess( - *_query, orExpr.release(), false, _plannerParams.indices, _plannerParams); + // Check that we have good cache data. For example, we don't cache things + // for 2d indices. + if (NULL == bestSoln->cacheData.get()) { + mongoutils::str::stream ss; + ss << "No cache data for subchild " << orChild->toString(); + return Status(ErrorCodes::BadValue, ss); + } - if (NULL == solnRoot) { - mongoutils::str::stream ss; - ss << "Failed to build indexed data path for subplanned query\n"; - return Status(ErrorCodes::BadValue, ss); - } + if (SolutionCacheData::USE_INDEX_TAGS_SOLN != bestSoln->cacheData->solnType) { + mongoutils::str::stream ss; + ss << "No indexed cache data for subchild " << orChild->toString(); + return Status(ErrorCodes::BadValue, ss); + } - LOG(5) << "Subplanner: fully tagged tree is " << solnRoot->toString(); + // Add the index assignments to our original query. + Status tagStatus = QueryPlanner::tagAccordingToCache( + orChild, bestSoln->cacheData->tree.get(), _indexMap); - // Takes ownership of 'solnRoot' - _compositeSolution.reset(QueryPlannerAnalysis::analyzeDataAccess(*_query, - _plannerParams, - solnRoot)); + if (!tagStatus.isOK()) { + mongoutils::str::stream ss; + ss << "Failed to extract indices from subchild " << orChild->toString(); + return Status(ErrorCodes::BadValue, ss); + } - if (NULL == _compositeSolution.get()) { - mongoutils::str::stream ss; - ss << "Failed to analyze subplanned query"; - return Status(ErrorCodes::BadValue, ss); + cacheData->children.push_back(bestSoln->cacheData->tree->clone()); } + } - LOG(5) << "Subplanner: Composite solution is " << _compositeSolution->toString() << endl; + // Must do this before using the planner functionality. + sortUsingTags(orExpr.get()); - // Use the index tags from planning each branch to construct the composite solution, - // and set that solution as our child stage. 
- _ws->clear(); - PlanStage* root; - invariant(StageBuilder::build(_txn, _collection, *_compositeSolution.get(), _ws, &root)); - _child.reset(root); + // Use the cached index assignments to build solnRoot. Takes ownership of 'orExpr'. + QuerySolutionNode* solnRoot = QueryPlannerAccess::buildIndexedDataAccess( + *_query, orExpr.release(), false, _plannerParams.indices, _plannerParams); - return Status::OK(); + if (NULL == solnRoot) { + mongoutils::str::stream ss; + ss << "Failed to build indexed data path for subplanned query\n"; + return Status(ErrorCodes::BadValue, ss); } - Status SubplanStage::choosePlanWholeQuery(PlanYieldPolicy* yieldPolicy) { - // Clear out the working set. We'll start with a fresh working set. - _ws->clear(); + LOG(5) << "Subplanner: fully tagged tree is " << solnRoot->toString(); - // Use the query planning module to plan the whole query. - vector<QuerySolution*> rawSolutions; - Status status = QueryPlanner::plan(*_query, _plannerParams, &rawSolutions); - if (!status.isOK()) { - return Status(ErrorCodes::BadValue, - "error processing query: " + _query->toString() + - " planner returned error: " + status.reason()); - } + // Takes ownership of 'solnRoot' + _compositeSolution.reset( + QueryPlannerAnalysis::analyzeDataAccess(*_query, _plannerParams, solnRoot)); - OwnedPointerVector<QuerySolution> solutions(rawSolutions); + if (NULL == _compositeSolution.get()) { + mongoutils::str::stream ss; + ss << "Failed to analyze subplanned query"; + return Status(ErrorCodes::BadValue, ss); + } - // We cannot figure out how to answer the query. Perhaps it requires an index - // we do not have? - if (0 == solutions.size()) { - return Status(ErrorCodes::BadValue, - str::stream() - << "error processing query: " - << _query->toString() - << " No query solutions"); - } + LOG(5) << "Subplanner: Composite solution is " << _compositeSolution->toString() << endl; - if (1 == solutions.size()) { - PlanStage* root; - // Only one possible plan. Run it. 
Build the stages from the solution. - verify(StageBuilder::build(_txn, _collection, *solutions[0], _ws, &root)); - _child.reset(root); + // Use the index tags from planning each branch to construct the composite solution, + // and set that solution as our child stage. + _ws->clear(); + PlanStage* root; + invariant(StageBuilder::build(_txn, _collection, *_compositeSolution.get(), _ws, &root)); + _child.reset(root); - // This SubplanStage takes ownership of the query solution. - _compositeSolution.reset(solutions.popAndReleaseBack()); + return Status::OK(); +} - return Status::OK(); - } - else { - // Many solutions. Create a MultiPlanStage to pick the best, update the cache, - // and so on. The working set will be shared by all candidate plans. - _child.reset(new MultiPlanStage(_txn, _collection, _query)); - MultiPlanStage* multiPlanStage = static_cast<MultiPlanStage*>(_child.get()); +Status SubplanStage::choosePlanWholeQuery(PlanYieldPolicy* yieldPolicy) { + // Clear out the working set. We'll start with a fresh working set. + _ws->clear(); + + // Use the query planning module to plan the whole query. + vector<QuerySolution*> rawSolutions; + Status status = QueryPlanner::plan(*_query, _plannerParams, &rawSolutions); + if (!status.isOK()) { + return Status(ErrorCodes::BadValue, + "error processing query: " + _query->toString() + + " planner returned error: " + status.reason()); + } - for (size_t ix = 0; ix < solutions.size(); ++ix) { - if (solutions[ix]->cacheData.get()) { - solutions[ix]->cacheData->indexFilterApplied = - _plannerParams.indexFiltersApplied; - } + OwnedPointerVector<QuerySolution> solutions(rawSolutions); - // version of StageBuild::build when WorkingSet is shared - PlanStage* nextPlanRoot; - verify(StageBuilder::build(_txn, _collection, *solutions[ix], _ws, - &nextPlanRoot)); + // We cannot figure out how to answer the query. Perhaps it requires an index + // we do not have? 
+ if (0 == solutions.size()) { + return Status(ErrorCodes::BadValue, + str::stream() << "error processing query: " << _query->toString() + << " No query solutions"); + } - // Takes ownership of 'solutions[ix]' and 'nextPlanRoot'. - multiPlanStage->addPlan(solutions.releaseAt(ix), nextPlanRoot, _ws); - } + if (1 == solutions.size()) { + PlanStage* root; + // Only one possible plan. Run it. Build the stages from the solution. + verify(StageBuilder::build(_txn, _collection, *solutions[0], _ws, &root)); + _child.reset(root); - // Delegate the the MultiPlanStage's plan selection facility. - Status planSelectStat = multiPlanStage->pickBestPlan(yieldPolicy); - if (!planSelectStat.isOK()) { - return planSelectStat; - } + // This SubplanStage takes ownership of the query solution. + _compositeSolution.reset(solutions.popAndReleaseBack()); - return Status::OK(); - } - } + return Status::OK(); + } else { + // Many solutions. Create a MultiPlanStage to pick the best, update the cache, + // and so on. The working set will be shared by all candidate plans. + _child.reset(new MultiPlanStage(_txn, _collection, _query)); + MultiPlanStage* multiPlanStage = static_cast<MultiPlanStage*>(_child.get()); + + for (size_t ix = 0; ix < solutions.size(); ++ix) { + if (solutions[ix]->cacheData.get()) { + solutions[ix]->cacheData->indexFilterApplied = _plannerParams.indexFiltersApplied; + } - Status SubplanStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) { - // Adds the amount of time taken by pickBestPlan() to executionTimeMillis. There's lots of - // work that happens here, so this is needed for the time accounting to make sense. - ScopedTimer timer(&_commonStats.executionTimeMillis); + // version of StageBuild::build when WorkingSet is shared + PlanStage* nextPlanRoot; + verify(StageBuilder::build(_txn, _collection, *solutions[ix], _ws, &nextPlanRoot)); - // Plan each branch of the $or. 
- Status subplanningStatus = planSubqueries(); - if (!subplanningStatus.isOK()) { - return choosePlanWholeQuery(yieldPolicy); + // Takes ownership of 'solutions[ix]' and 'nextPlanRoot'. + multiPlanStage->addPlan(solutions.releaseAt(ix), nextPlanRoot, _ws); } - // Use the multi plan stage to select a winning plan for each branch, and then construct - // the overall winning plan from the resulting index tags. - Status subplanSelectStat = choosePlanForSubqueries(yieldPolicy); - if (!subplanSelectStat.isOK()) { - return choosePlanWholeQuery(yieldPolicy); + // Delegate the the MultiPlanStage's plan selection facility. + Status planSelectStat = multiPlanStage->pickBestPlan(yieldPolicy); + if (!planSelectStat.isOK()) { + return planSelectStat; } return Status::OK(); } +} - bool SubplanStage::isEOF() { - // If we're running we best have a runner. - invariant(_child.get()); - return _child->isEOF(); +Status SubplanStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) { + // Adds the amount of time taken by pickBestPlan() to executionTimeMillis. There's lots of + // work that happens here, so this is needed for the time accounting to make sense. + ScopedTimer timer(&_commonStats.executionTimeMillis); + + // Plan each branch of the $or. + Status subplanningStatus = planSubqueries(); + if (!subplanningStatus.isOK()) { + return choosePlanWholeQuery(yieldPolicy); } - PlanStage::StageState SubplanStage::work(WorkingSetID* out) { - ++_commonStats.works; + // Use the multi plan stage to select a winning plan for each branch, and then construct + // the overall winning plan from the resulting index tags. + Status subplanSelectStat = choosePlanForSubqueries(yieldPolicy); + if (!subplanSelectStat.isOK()) { + return choosePlanWholeQuery(yieldPolicy); + } - // Adds the amount of time taken by work() to executionTimeMillis. 
- ScopedTimer timer(&_commonStats.executionTimeMillis); + return Status::OK(); +} - if (isEOF()) { return PlanStage::IS_EOF; } +bool SubplanStage::isEOF() { + // If we're running we best have a runner. + invariant(_child.get()); + return _child->isEOF(); +} - invariant(_child.get()); - StageState state = _child->work(out); +PlanStage::StageState SubplanStage::work(WorkingSetID* out) { + ++_commonStats.works; - if (PlanStage::NEED_TIME == state) { - ++_commonStats.needTime; - } - else if (PlanStage::NEED_YIELD == state) { - ++_commonStats.needYield; - } - else if (PlanStage::ADVANCED == state) { - ++_commonStats.advanced; - } + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); - return state; + if (isEOF()) { + return PlanStage::IS_EOF; } - void SubplanStage::saveState() { - _txn = NULL; - ++_commonStats.yields; + invariant(_child.get()); + StageState state = _child->work(out); - // We're ranking a sub-plan via an MPS or we're streaming results from this stage. Either - // way, pass on the request. - if (NULL != _child.get()) { - _child->saveState(); - } + if (PlanStage::NEED_TIME == state) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == state) { + ++_commonStats.needYield; + } else if (PlanStage::ADVANCED == state) { + ++_commonStats.advanced; } - void SubplanStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; + return state; +} - // We're ranking a sub-plan via an MPS or we're streaming results from this stage. Either - // way, pass on the request. 
- if (NULL != _child.get()) { - _child->restoreState(opCtx); - } - } - - void SubplanStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; +void SubplanStage::saveState() { + _txn = NULL; + ++_commonStats.yields; - if (NULL != _child.get()) { - _child->invalidate(txn, dl, type); - } + // We're ranking a sub-plan via an MPS or we're streaming results from this stage. Either + // way, pass on the request. + if (NULL != _child.get()) { + _child->saveState(); } +} - vector<PlanStage*> SubplanStage::getChildren() const { - vector<PlanStage*> children; - if (NULL != _child.get()) { - children.push_back(_child.get()); - } - return children; - } +void SubplanStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; - PlanStageStats* SubplanStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_SUBPLAN)); - ret->children.push_back(_child->getStats()); - return ret.release(); + // We're ranking a sub-plan via an MPS or we're streaming results from this stage. Either + // way, pass on the request. 
+ if (NULL != _child.get()) { + _child->restoreState(opCtx); } +} - bool SubplanStage::branchPlannedFromCache(size_t i) const { - return NULL != _branchResults[i]->cachedSolution.get(); - } +void SubplanStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; - const CommonStats* SubplanStage::getCommonStats() const { - return &_commonStats; + if (NULL != _child.get()) { + _child->invalidate(txn, dl, type); } +} - const SpecificStats* SubplanStage::getSpecificStats() const { - return NULL; +vector<PlanStage*> SubplanStage::getChildren() const { + vector<PlanStage*> children; + if (NULL != _child.get()) { + children.push_back(_child.get()); } + return children; +} + +PlanStageStats* SubplanStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_SUBPLAN)); + ret->children.push_back(_child->getStats()); + return ret.release(); +} + +bool SubplanStage::branchPlannedFromCache(size_t i) const { + return NULL != _branchResults[i]->cachedSolution.get(); +} + +const CommonStats* SubplanStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* SubplanStage::getSpecificStats() const { + return NULL; +} } // namespace mongo diff --git a/src/mongo/db/exec/subplan.h b/src/mongo/db/exec/subplan.h index f46a25b3bc8..ca831a1856e 100644 --- a/src/mongo/db/exec/subplan.h +++ b/src/mongo/db/exec/subplan.h @@ -42,154 +42,157 @@ namespace mongo { - class OperationContext; +class OperationContext; + +/** + * The SubplanStage is used for rooted $or queries. It plans each clause of the $or + * individually, and then creates an overall query plan based on the winning plan from + * each clause. + * + * Uses the MultiPlanStage in order to rank plans for the individual clauses. + * + * Notes on caching strategy: + * + * --Interaction with the plan cache is done on a per-clause basis. 
For a given clause C, + * if there is a plan in the cache for shape C, then C is planned using the index tags + * obtained from the plan cache entry. If no cached plan is found for C, then a MultiPlanStage + * is used to determine the best plan for the clause; unless there is a tie between multiple + * candidate plans, the winner is inserted into the plan cache and used to plan subsequent + * executions of C. These subsequent executions of shape C could be either as a clause in + * another rooted $or query, or shape C as its own query. + * + * --Plans for entire rooted $or queries are neither written to nor read from the plan cache. + */ +class SubplanStage : public PlanStage { +public: + SubplanStage(OperationContext* txn, + Collection* collection, + WorkingSet* ws, + const QueryPlannerParams& params, + CanonicalQuery* cq); + + static bool canUseSubplanning(const CanonicalQuery& query); + + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); + + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + + virtual std::vector<PlanStage*> getChildren() const; + + virtual StageType stageType() const { + return STAGE_SUBPLAN; + } + + PlanStageStats* getStats(); + + virtual const CommonStats* getCommonStats() const; + + virtual const SpecificStats* getSpecificStats() const; + + static const char* kStageType; /** - * The SubplanStage is used for rooted $or queries. It plans each clause of the $or - * individually, and then creates an overall query plan based on the winning plan from - * each clause. + * Selects a plan using subplanning. First uses the query planning results from + * planSubqueries() and the multi plan stage to select the best plan for each branch. * - * Uses the MultiPlanStage in order to rank plans for the individual clauses. 
+ * If this effort fails, then falls back on planning the whole query normally rather + * then planning $or branches independently. * - * Notes on caching strategy: + * If 'yieldPolicy' is non-NULL, then all locks may be yielded in between round-robin + * works of the candidate plans. By default, 'yieldPolicy' is NULL and no yielding will + * take place. * - * --Interaction with the plan cache is done on a per-clause basis. For a given clause C, - * if there is a plan in the cache for shape C, then C is planned using the index tags - * obtained from the plan cache entry. If no cached plan is found for C, then a MultiPlanStage - * is used to determine the best plan for the clause; unless there is a tie between multiple - * candidate plans, the winner is inserted into the plan cache and used to plan subsequent - * executions of C. These subsequent executions of shape C could be either as a clause in - * another rooted $or query, or shape C as its own query. - * - * --Plans for entire rooted $or queries are neither written to nor read from the plan cache. + * Returns a non-OK status if the plan was killed during yield or if planning fails. + */ + Status pickBestPlan(PlanYieldPolicy* yieldPolicy); + + // + // For testing. + // + + /** + * Returns true if the i-th branch was planned by retrieving a cached solution, + * otherwise returns false. + */ + bool branchPlannedFromCache(size_t i) const; + +private: + /** + * A class used internally in order to keep track of the results of planning + * a particular $or branch. 
*/ - class SubplanStage : public PlanStage { + struct BranchPlanningResult { + MONGO_DISALLOW_COPYING(BranchPlanningResult); + public: - SubplanStage(OperationContext* txn, - Collection* collection, - WorkingSet* ws, - const QueryPlannerParams& params, - CanonicalQuery* cq); - - static bool canUseSubplanning(const CanonicalQuery& query); - - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); - - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - - virtual std::vector<PlanStage*> getChildren() const; - - virtual StageType stageType() const { return STAGE_SUBPLAN; } - - PlanStageStats* getStats(); - - virtual const CommonStats* getCommonStats() const; - - virtual const SpecificStats* getSpecificStats() const; - - static const char* kStageType; - - /** - * Selects a plan using subplanning. First uses the query planning results from - * planSubqueries() and the multi plan stage to select the best plan for each branch. - * - * If this effort fails, then falls back on planning the whole query normally rather - * then planning $or branches independently. - * - * If 'yieldPolicy' is non-NULL, then all locks may be yielded in between round-robin - * works of the candidate plans. By default, 'yieldPolicy' is NULL and no yielding will - * take place. - * - * Returns a non-OK status if the plan was killed during yield or if planning fails. - */ - Status pickBestPlan(PlanYieldPolicy* yieldPolicy); - - // - // For testing. - // - - /** - * Returns true if the i-th branch was planned by retrieving a cached solution, - * otherwise returns false. - */ - bool branchPlannedFromCache(size_t i) const; - - private: - /** - * A class used internally in order to keep track of the results of planning - * a particular $or branch. 
- */ - struct BranchPlanningResult { - MONGO_DISALLOW_COPYING(BranchPlanningResult); - public: - BranchPlanningResult() { } - - // A parsed version of one branch of the $or. - std::unique_ptr<CanonicalQuery> canonicalQuery; - - // If there is cache data available, then we store it here rather than generating - // a set of alternate plans for the branch. The index tags from the cache data - // can be applied directly to the parent $or MatchExpression when generating the - // composite solution. - std::unique_ptr<CachedSolution> cachedSolution; - - // Query solutions resulting from planning the $or branch. - OwnedPointerVector<QuerySolution> solutions; - }; - - /** - * Plan each branch of the $or independently, and store the resulting - * lists of query solutions in '_solutions'. - * - * Called from SubplanStage::make so that construction of the subplan stage - * fails immediately, rather than returning a plan executor and subsequently - * through getNext(...). - */ - Status planSubqueries(); - - /** - * Uses the query planning results from planSubqueries() and the multi plan stage - * to select the best plan for each branch. - * - * Helper for pickBestPlan(). - */ - Status choosePlanForSubqueries(PlanYieldPolicy* yieldPolicy); - - /** - * Used as a fallback if subplanning fails. Helper for pickBestPlan(). - */ - Status choosePlanWholeQuery(PlanYieldPolicy* yieldPolicy); - - // transactional context for read locks. Not owned by us - OperationContext* _txn; - - // Not owned here. Must be non-null. - Collection* _collection; - - // Not owned here. - WorkingSet* _ws; - - QueryPlannerParams _plannerParams; - - // Not owned here. - CanonicalQuery* _query; - - // If we successfully create a "composite solution" by planning each $or branch - // independently, that solution is owned here. - std::unique_ptr<QuerySolution> _compositeSolution; - - std::unique_ptr<PlanStage> _child; - - // Holds a list of the results from planning each branch. 
- OwnedPointerVector<BranchPlanningResult> _branchResults; - - // We need this to extract cache-friendly index data from the index assignments. - std::map<BSONObj, size_t> _indexMap; - - CommonStats _commonStats; + BranchPlanningResult() {} + + // A parsed version of one branch of the $or. + std::unique_ptr<CanonicalQuery> canonicalQuery; + + // If there is cache data available, then we store it here rather than generating + // a set of alternate plans for the branch. The index tags from the cache data + // can be applied directly to the parent $or MatchExpression when generating the + // composite solution. + std::unique_ptr<CachedSolution> cachedSolution; + + // Query solutions resulting from planning the $or branch. + OwnedPointerVector<QuerySolution> solutions; }; + /** + * Plan each branch of the $or independently, and store the resulting + * lists of query solutions in '_solutions'. + * + * Called from SubplanStage::make so that construction of the subplan stage + * fails immediately, rather than returning a plan executor and subsequently + * through getNext(...). + */ + Status planSubqueries(); + + /** + * Uses the query planning results from planSubqueries() and the multi plan stage + * to select the best plan for each branch. + * + * Helper for pickBestPlan(). + */ + Status choosePlanForSubqueries(PlanYieldPolicy* yieldPolicy); + + /** + * Used as a fallback if subplanning fails. Helper for pickBestPlan(). + */ + Status choosePlanWholeQuery(PlanYieldPolicy* yieldPolicy); + + // transactional context for read locks. Not owned by us + OperationContext* _txn; + + // Not owned here. Must be non-null. + Collection* _collection; + + // Not owned here. + WorkingSet* _ws; + + QueryPlannerParams _plannerParams; + + // Not owned here. + CanonicalQuery* _query; + + // If we successfully create a "composite solution" by planning each $or branch + // independently, that solution is owned here. 
+ std::unique_ptr<QuerySolution> _compositeSolution; + + std::unique_ptr<PlanStage> _child; + + // Holds a list of the results from planning each branch. + OwnedPointerVector<BranchPlanningResult> _branchResults; + + // We need this to extract cache-friendly index data from the index assignments. + std::map<BSONObj, size_t> _indexMap; + + CommonStats _commonStats; +}; + } // namespace mongo diff --git a/src/mongo/db/exec/text.cpp b/src/mongo/db/exec/text.cpp index f9db2e94be3..933e2b6aba2 100644 --- a/src/mongo/db/exec/text.cpp +++ b/src/mongo/db/exec/text.cpp @@ -41,54 +41,55 @@ namespace mongo { - using std::unique_ptr; - using std::string; - using std::vector; - - // static - const char* TextStage::kStageType = "TEXT"; - - TextStage::TextStage(OperationContext* txn, - const TextStageParams& params, - WorkingSet* ws, - const MatchExpression* filter) - : _txn(txn), - _params(params), - _ftsMatcher(params.query, params.spec), - _ws(ws), - _filter(filter), - _commonStats(kStageType), - _internalState(INIT_SCANS), - _currentIndexScanner(0), - _idRetrying(WorkingSet::INVALID_ID) { - _scoreIterator = _scores.end(); - _specificStats.indexPrefix = _params.indexPrefix; - _specificStats.indexName = _params.index->indexName(); +using std::unique_ptr; +using std::string; +using std::vector; + +// static +const char* TextStage::kStageType = "TEXT"; + +TextStage::TextStage(OperationContext* txn, + const TextStageParams& params, + WorkingSet* ws, + const MatchExpression* filter) + : _txn(txn), + _params(params), + _ftsMatcher(params.query, params.spec), + _ws(ws), + _filter(filter), + _commonStats(kStageType), + _internalState(INIT_SCANS), + _currentIndexScanner(0), + _idRetrying(WorkingSet::INVALID_ID) { + _scoreIterator = _scores.end(); + _specificStats.indexPrefix = _params.indexPrefix; + _specificStats.indexName = _params.index->indexName(); +} + +TextStage::~TextStage() {} + +bool TextStage::isEOF() { + return _internalState == DONE; +} + +PlanStage::StageState 
TextStage::work(WorkingSetID* out) { + ++_commonStats.works; + + // Adds the amount of time taken by work() to executionTimeMillis. + ScopedTimer timer(&_commonStats.executionTimeMillis); + + if (isEOF()) { + return PlanStage::IS_EOF; } + invariant(_internalState != DONE); - TextStage::~TextStage() { } + PlanStage::StageState stageState = PlanStage::IS_EOF; - bool TextStage::isEOF() { - return _internalState == DONE; - } - - PlanStage::StageState TextStage::work(WorkingSetID* out) { - ++_commonStats.works; - - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); - - if (isEOF()) { return PlanStage::IS_EOF; } - invariant(_internalState != DONE); - - PlanStage::StageState stageState = PlanStage::IS_EOF; - - switch (_internalState) { + switch (_internalState) { case INIT_SCANS: try { stageState = initScans(out); - } - catch (const WriteConflictException& wce) { + } catch (const WriteConflictException& wce) { // Reset and try again next time. _internalState = INIT_SCANS; _scanners.clear(); @@ -106,10 +107,10 @@ namespace mongo { case DONE: // Handled above. break; - } + } - // Increment common stats counters that are specific to the return value of work(). - switch (stageState) { + // Increment common stats counters that are specific to the return value of work(). 
+ switch (stageState) { case PlanStage::ADVANCED: ++_commonStats.advanced; break; @@ -121,376 +122,358 @@ namespace mongo { break; default: break; - } - - return stageState; } - void TextStage::saveState() { - _txn = NULL; - ++_commonStats.yields; + return stageState; +} - for (size_t i = 0; i < _scanners.size(); ++i) { - _scanners.mutableVector()[i]->saveState(); - } +void TextStage::saveState() { + _txn = NULL; + ++_commonStats.yields; - if (_recordCursor) _recordCursor->saveUnpositioned(); + for (size_t i = 0; i < _scanners.size(); ++i) { + _scanners.mutableVector()[i]->saveState(); } - void TextStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; + if (_recordCursor) + _recordCursor->saveUnpositioned(); +} - for (size_t i = 0; i < _scanners.size(); ++i) { - _scanners.mutableVector()[i]->restoreState(opCtx); - } +void TextStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; - if (_recordCursor) invariant(_recordCursor->restore(opCtx)); + for (size_t i = 0; i < _scanners.size(); ++i) { + _scanners.mutableVector()[i]->restoreState(opCtx); } - void TextStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; + if (_recordCursor) + invariant(_recordCursor->restore(opCtx)); +} - // Propagate invalidate to children. - for (size_t i = 0; i < _scanners.size(); ++i) { - _scanners.mutableVector()[i]->invalidate(txn, dl, type); - } +void TextStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; - // We store the score keyed by RecordId. We have to toss out our state when the RecordId - // changes. - // TODO: If we're RETURNING_RESULTS we could somehow buffer the object. 
- ScoreMap::iterator scoreIt = _scores.find(dl); - if (scoreIt != _scores.end()) { - if (scoreIt == _scoreIterator) { - _scoreIterator++; - } - _scores.erase(scoreIt); - } + // Propagate invalidate to children. + for (size_t i = 0; i < _scanners.size(); ++i) { + _scanners.mutableVector()[i]->invalidate(txn, dl, type); } - vector<PlanStage*> TextStage::getChildren() const { - vector<PlanStage*> empty; - return empty; + // We store the score keyed by RecordId. We have to toss out our state when the RecordId + // changes. + // TODO: If we're RETURNING_RESULTS we could somehow buffer the object. + ScoreMap::iterator scoreIt = _scores.find(dl); + if (scoreIt != _scores.end()) { + if (scoreIt == _scoreIterator) { + _scoreIterator++; + } + _scores.erase(scoreIt); } +} - PlanStageStats* TextStage::getStats() { - _commonStats.isEOF = isEOF(); +vector<PlanStage*> TextStage::getChildren() const { + vector<PlanStage*> empty; + return empty; +} - // Add a BSON representation of the filter to the stats tree, if there is one. - if (NULL != _filter) { - BSONObjBuilder bob; - _filter->toBSON(&bob); - _commonStats.filter = bob.obj(); - } +PlanStageStats* TextStage::getStats() { + _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_TEXT)); - ret->specific.reset(new TextStats(_specificStats)); - return ret.release(); + // Add a BSON representation of the filter to the stats tree, if there is one. 
+ if (NULL != _filter) { + BSONObjBuilder bob; + _filter->toBSON(&bob); + _commonStats.filter = bob.obj(); } - const CommonStats* TextStage::getCommonStats() const { - return &_commonStats; + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_TEXT)); + ret->specific.reset(new TextStats(_specificStats)); + return ret.release(); +} + +const CommonStats* TextStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* TextStage::getSpecificStats() const { + return &_specificStats; +} + +PlanStage::StageState TextStage::initScans(WorkingSetID* out) { + invariant(0 == _scanners.size()); + + _recordCursor = _params.index->getCollection()->getCursor(_txn); + + _specificStats.parsedTextQuery = _params.query.toBSON(); + + // Get all the index scans for each term in our query. + // TODO it would be more efficient to only have one active scan at a time and create the + // next when each finishes. + for (std::set<std::string>::const_iterator it = _params.query.getTermsForBounds().begin(); + it != _params.query.getTermsForBounds().end(); + ++it) { + const string& term = *it; + IndexScanParams params; + params.bounds.startKey = FTSIndexFormat::getIndexKey( + MAX_WEIGHT, term, _params.indexPrefix, _params.spec.getTextIndexVersion()); + params.bounds.endKey = FTSIndexFormat::getIndexKey( + 0, term, _params.indexPrefix, _params.spec.getTextIndexVersion()); + params.bounds.endKeyInclusive = true; + params.bounds.isSimpleRange = true; + params.descriptor = _params.index; + params.direction = -1; + _scanners.mutableVector().push_back(new IndexScan(_txn, params, _ws, NULL)); } - const SpecificStats* TextStage::getSpecificStats() const { - return &_specificStats; + // If we have no terms we go right to EOF. 
+ if (0 == _scanners.size()) { + _internalState = DONE; + return PlanStage::IS_EOF; } - PlanStage::StageState TextStage::initScans(WorkingSetID* out) { - invariant(0 == _scanners.size()); - - _recordCursor = _params.index->getCollection()->getCursor(_txn); - - _specificStats.parsedTextQuery = _params.query.toBSON(); - - // Get all the index scans for each term in our query. - // TODO it would be more efficient to only have one active scan at a time and create the - // next when each finishes. - for (std::set<std::string>::const_iterator it = _params.query.getTermsForBounds().begin(); - it != _params.query.getTermsForBounds().end(); - ++it) { - const string& term = *it; - IndexScanParams params; - params.bounds.startKey = FTSIndexFormat::getIndexKey(MAX_WEIGHT, - term, - _params.indexPrefix, - _params.spec.getTextIndexVersion()); - params.bounds.endKey = FTSIndexFormat::getIndexKey(0, - term, - _params.indexPrefix, - _params.spec.getTextIndexVersion()); - params.bounds.endKeyInclusive = true; - params.bounds.isSimpleRange = true; - params.descriptor = _params.index; - params.direction = -1; - _scanners.mutableVector().push_back(new IndexScan(_txn, params, _ws, NULL)); - } - - // If we have no terms we go right to EOF. - if (0 == _scanners.size()) { - _internalState = DONE; - return PlanStage::IS_EOF; - } - - // Transition to the next state. - _internalState = READING_TERMS; - return PlanStage::NEED_TIME; + // Transition to the next state. + _internalState = READING_TERMS; + return PlanStage::NEED_TIME; +} + +PlanStage::StageState TextStage::readFromSubScanners(WorkingSetID* out) { + // This should be checked before we get here. + invariant(_currentIndexScanner < _scanners.size()); + + // Either retry the last WSM we worked on or get a new one from our current scanner. 
+ WorkingSetID id; + StageState childState; + if (_idRetrying == WorkingSet::INVALID_ID) { + childState = _scanners.vector()[_currentIndexScanner]->work(&id); + } else { + childState = ADVANCED; + id = _idRetrying; + _idRetrying = WorkingSet::INVALID_ID; } - PlanStage::StageState TextStage::readFromSubScanners(WorkingSetID* out) { - // This should be checked before we get here. - invariant(_currentIndexScanner < _scanners.size()); - - // Either retry the last WSM we worked on or get a new one from our current scanner. - WorkingSetID id; - StageState childState; - if (_idRetrying == WorkingSet::INVALID_ID) { - childState = _scanners.vector()[_currentIndexScanner]->work(&id); - } - else { - childState = ADVANCED; - id = _idRetrying; - _idRetrying = WorkingSet::INVALID_ID; - } + if (PlanStage::ADVANCED == childState) { + return addTerm(id, out); + } else if (PlanStage::IS_EOF == childState) { + // Done with this scan. + ++_currentIndexScanner; - if (PlanStage::ADVANCED == childState) { - return addTerm(id, out); + if (_currentIndexScanner < _scanners.size()) { + // We have another scan to read from. + return PlanStage::NEED_TIME; } - else if (PlanStage::IS_EOF == childState) { - // Done with this scan. - ++_currentIndexScanner; - if (_currentIndexScanner < _scanners.size()) { - // We have another scan to read from. - return PlanStage::NEED_TIME; - } + // If we're here we are done reading results. Move to the next state. + _scoreIterator = _scores.begin(); + _internalState = RETURNING_RESULTS; - // If we're here we are done reading results. Move to the next state. - _scoreIterator = _scores.begin(); - _internalState = RETURNING_RESULTS; - - // Don't need to keep these around. - _scanners.clear(); - return PlanStage::NEED_TIME; - } - else { - // Propagate WSID from below. - *out = id; - if (PlanStage::FAILURE == childState) { - // If a stage fails, it may create a status WSM to indicate why it - // failed, in which case 'id' is valid. 
If ID is invalid, we - // create our own error message. - if (WorkingSet::INVALID_ID == id) { - mongoutils::str::stream ss; - ss << "text stage failed to read in results from child"; - Status status(ErrorCodes::InternalError, ss); - *out = WorkingSetCommon::allocateStatusMember( _ws, status); - } + // Don't need to keep these around. + _scanners.clear(); + return PlanStage::NEED_TIME; + } else { + // Propagate WSID from below. + *out = id; + if (PlanStage::FAILURE == childState) { + // If a stage fails, it may create a status WSM to indicate why it + // failed, in which case 'id' is valid. If ID is invalid, we + // create our own error message. + if (WorkingSet::INVALID_ID == id) { + mongoutils::str::stream ss; + ss << "text stage failed to read in results from child"; + Status status(ErrorCodes::InternalError, ss); + *out = WorkingSetCommon::allocateStatusMember(_ws, status); } - return childState; } + return childState; } +} - PlanStage::StageState TextStage::returnResults(WorkingSetID* out) { - if (_scoreIterator == _scores.end()) { - _internalState = DONE; - return PlanStage::IS_EOF; - } - - // Filter for phrases and negative terms, score and truncate. - TextRecordData textRecordData = _scoreIterator->second; +PlanStage::StageState TextStage::returnResults(WorkingSetID* out) { + if (_scoreIterator == _scores.end()) { + _internalState = DONE; + return PlanStage::IS_EOF; + } - // Ignore non-matched documents. - if (textRecordData.score < 0) { - _scoreIterator++; - invariant(textRecordData.wsid == WorkingSet::INVALID_ID); - return PlanStage::NEED_TIME; - } - - WorkingSetMember* wsm = _ws->get(textRecordData.wsid); - try { - if (!WorkingSetCommon::fetchIfUnfetched(_txn, wsm, _recordCursor)) { - _scoreIterator++; - _ws->free(textRecordData.wsid); - _commonStats.needTime++; - return NEED_TIME; - } - } - catch (const WriteConflictException& wce) { - // Do this record again next time around. 
- *out = WorkingSet::INVALID_ID; - _commonStats.needYield++; - return NEED_YIELD; - } + // Filter for phrases and negative terms, score and truncate. + TextRecordData textRecordData = _scoreIterator->second; + // Ignore non-matched documents. + if (textRecordData.score < 0) { _scoreIterator++; + invariant(textRecordData.wsid == WorkingSet::INVALID_ID); + return PlanStage::NEED_TIME; + } - // Filter for phrases and negated terms - if (!_ftsMatcher.matches(wsm->obj.value())) { + WorkingSetMember* wsm = _ws->get(textRecordData.wsid); + try { + if (!WorkingSetCommon::fetchIfUnfetched(_txn, wsm, _recordCursor)) { + _scoreIterator++; _ws->free(textRecordData.wsid); - return PlanStage::NEED_TIME; + _commonStats.needTime++; + return NEED_TIME; } + } catch (const WriteConflictException& wce) { + // Do this record again next time around. + *out = WorkingSet::INVALID_ID; + _commonStats.needYield++; + return NEED_YIELD; + } - // Populate the working set member with the text score and return it. - wsm->addComputed(new TextScoreComputedData(textRecordData.score)); - *out = textRecordData.wsid; - return PlanStage::ADVANCED; + _scoreIterator++; + + // Filter for phrases and negated terms + if (!_ftsMatcher.matches(wsm->obj.value())) { + _ws->free(textRecordData.wsid); + return PlanStage::NEED_TIME; } - class TextMatchableDocument : public MatchableDocument { - public: - TextMatchableDocument(OperationContext* txn, - const BSONObj& keyPattern, - const BSONObj& key, - WorkingSetMember* wsm, - unowned_ptr<RecordCursor> recordCursor) - : _txn(txn), - _recordCursor(recordCursor), - _keyPattern(keyPattern), - _key(key), - _wsm(wsm) { } - - BSONObj toBSON() const { - return getObj(); - } + // Populate the working set member with the text score and return it. 
+ wsm->addComputed(new TextScoreComputedData(textRecordData.score)); + *out = textRecordData.wsid; + return PlanStage::ADVANCED; +} + +class TextMatchableDocument : public MatchableDocument { +public: + TextMatchableDocument(OperationContext* txn, + const BSONObj& keyPattern, + const BSONObj& key, + WorkingSetMember* wsm, + unowned_ptr<RecordCursor> recordCursor) + : _txn(txn), _recordCursor(recordCursor), _keyPattern(keyPattern), _key(key), _wsm(wsm) {} + + BSONObj toBSON() const { + return getObj(); + } - virtual ElementIterator* allocateIterator(const ElementPath* path) const { - if (!_wsm->hasObj()) { - // Try to look in the key. - BSONObjIterator keyPatternIt(_keyPattern); - BSONObjIterator keyDataIt(_key); - - while (keyPatternIt.more()) { - BSONElement keyPatternElt = keyPatternIt.next(); - verify(keyDataIt.more()); - BSONElement keyDataElt = keyDataIt.next(); - - if (path->fieldRef().equalsDottedField(keyPatternElt.fieldName())) { - if (Array == keyDataElt.type()) { - return new SimpleArrayElementIterator(keyDataElt, true); - } - else { - return new SingleElementElementIterator(keyDataElt); - } + virtual ElementIterator* allocateIterator(const ElementPath* path) const { + if (!_wsm->hasObj()) { + // Try to look in the key. + BSONObjIterator keyPatternIt(_keyPattern); + BSONObjIterator keyDataIt(_key); + + while (keyPatternIt.more()) { + BSONElement keyPatternElt = keyPatternIt.next(); + verify(keyDataIt.more()); + BSONElement keyDataElt = keyDataIt.next(); + + if (path->fieldRef().equalsDottedField(keyPatternElt.fieldName())) { + if (Array == keyDataElt.type()) { + return new SimpleArrayElementIterator(keyDataElt, true); + } else { + return new SingleElementElementIterator(keyDataElt); } } } - - // Go to the raw document, fetching if needed. - return new BSONElementIterator(path, getObj()); } - virtual void releaseIterator( ElementIterator* iterator ) const { - delete iterator; - } + // Go to the raw document, fetching if needed. 
+ return new BSONElementIterator(path, getObj()); + } - // Thrown if we detect that the document being matched was deleted. - class DocumentDeletedException {}; + virtual void releaseIterator(ElementIterator* iterator) const { + delete iterator; + } - private: - BSONObj getObj() const { - if (!WorkingSetCommon::fetchIfUnfetched(_txn, _wsm, _recordCursor)) - throw DocumentDeletedException(); + // Thrown if we detect that the document being matched was deleted. + class DocumentDeletedException {}; - // Make it owned since we are buffering results. - _wsm->obj.setValue(_wsm->obj.value().getOwned()); - return _wsm->obj.value(); - } +private: + BSONObj getObj() const { + if (!WorkingSetCommon::fetchIfUnfetched(_txn, _wsm, _recordCursor)) + throw DocumentDeletedException(); - OperationContext* _txn; - unowned_ptr<RecordCursor> _recordCursor; - BSONObj _keyPattern; - BSONObj _key; - WorkingSetMember* _wsm; - }; - - PlanStage::StageState TextStage::addTerm(WorkingSetID wsid, WorkingSetID* out) { - WorkingSetMember* wsm = _ws->get(wsid); - invariant(wsm->state == WorkingSetMember::LOC_AND_IDX); - invariant(1 == wsm->keyData.size()); - const IndexKeyDatum newKeyData = wsm->keyData.back(); // copy to keep it around. - - TextRecordData* textRecordData = &_scores[wsm->loc]; - double* documentAggregateScore = &textRecordData->score; - - if (WorkingSet::INVALID_ID == textRecordData->wsid) { - // We haven't seen this RecordId before. Keep the working set member around - // (it may be force-fetched on saveState()). - textRecordData->wsid = wsid; - - if (_filter) { - // We have not seen this document before and need to apply a filter. 
- bool shouldKeep; - bool wasDeleted = false; - try { - TextMatchableDocument tdoc(_txn, - newKeyData.indexKeyPattern, - newKeyData.keyData, - wsm, - _recordCursor); - shouldKeep = _filter->matches(&tdoc); - } - catch (const WriteConflictException& wce) { - _idRetrying = wsid; - *out = WorkingSet::INVALID_ID; - return NEED_YIELD; - } - catch (const TextMatchableDocument::DocumentDeletedException&) { - // We attempted to fetch the document but decided it should be excluded from the - // result set. - shouldKeep = false; - wasDeleted = true; - } + // Make it owned since we are buffering results. + _wsm->obj.setValue(_wsm->obj.value().getOwned()); + return _wsm->obj.value(); + } - if (!shouldKeep) { - if (wasDeleted || wsm->hasObj()) { - // We had to fetch but we're not going to return it. - ++_specificStats.fetches; - } - _ws->free(textRecordData->wsid); - textRecordData->wsid = WorkingSet::INVALID_ID; - *documentAggregateScore = -1; - return NEED_TIME; - } + OperationContext* _txn; + unowned_ptr<RecordCursor> _recordCursor; + BSONObj _keyPattern; + BSONObj _key; + WorkingSetMember* _wsm; +}; + +PlanStage::StageState TextStage::addTerm(WorkingSetID wsid, WorkingSetID* out) { + WorkingSetMember* wsm = _ws->get(wsid); + invariant(wsm->state == WorkingSetMember::LOC_AND_IDX); + invariant(1 == wsm->keyData.size()); + const IndexKeyDatum newKeyData = wsm->keyData.back(); // copy to keep it around. + + TextRecordData* textRecordData = &_scores[wsm->loc]; + double* documentAggregateScore = &textRecordData->score; + + if (WorkingSet::INVALID_ID == textRecordData->wsid) { + // We haven't seen this RecordId before. Keep the working set member around + // (it may be force-fetched on saveState()). + textRecordData->wsid = wsid; + + if (_filter) { + // We have not seen this document before and need to apply a filter. 
+ bool shouldKeep; + bool wasDeleted = false; + try { + TextMatchableDocument tdoc( + _txn, newKeyData.indexKeyPattern, newKeyData.keyData, wsm, _recordCursor); + shouldKeep = _filter->matches(&tdoc); + } catch (const WriteConflictException& wce) { + _idRetrying = wsid; + *out = WorkingSet::INVALID_ID; + return NEED_YIELD; + } catch (const TextMatchableDocument::DocumentDeletedException&) { + // We attempted to fetch the document but decided it should be excluded from the + // result set. + shouldKeep = false; + wasDeleted = true; } - else { - // If we're here, we're going to return the doc, and we do a fetch later. - ++_specificStats.fetches; + + if (!shouldKeep) { + if (wasDeleted || wsm->hasObj()) { + // We had to fetch but we're not going to return it. + ++_specificStats.fetches; + } + _ws->free(textRecordData->wsid); + textRecordData->wsid = WorkingSet::INVALID_ID; + *documentAggregateScore = -1; + return NEED_TIME; } + } else { + // If we're here, we're going to return the doc, and we do a fetch later. + ++_specificStats.fetches; } - else { - // We already have a working set member for this RecordId. Free the new - // WSM and retrieve the old one. - // Note that since we don't keep all index keys, we could get a score that doesn't match - // the document, but this has always been a problem. - // TODO something to improve the situation. - invariant(wsid != textRecordData->wsid); - _ws->free(wsid); - wsm = _ws->get(textRecordData->wsid); - } + } else { + // We already have a working set member for this RecordId. Free the new + // WSM and retrieve the old one. + // Note that since we don't keep all index keys, we could get a score that doesn't match + // the document, but this has always been a problem. + // TODO something to improve the situation. 
+ invariant(wsid != textRecordData->wsid); + _ws->free(wsid); + wsm = _ws->get(textRecordData->wsid); + } - ++_specificStats.keysExamined; + ++_specificStats.keysExamined; - if (*documentAggregateScore < 0) { - // We have already rejected this document for not matching the filter. - return NEED_TIME; - } + if (*documentAggregateScore < 0) { + // We have already rejected this document for not matching the filter. + return NEED_TIME; + } - // Locate score within possibly compound key: {prefix,term,score,suffix}. - BSONObjIterator keyIt(newKeyData.keyData); - for (unsigned i = 0; i < _params.spec.numExtraBefore(); i++) { - keyIt.next(); - } + // Locate score within possibly compound key: {prefix,term,score,suffix}. + BSONObjIterator keyIt(newKeyData.keyData); + for (unsigned i = 0; i < _params.spec.numExtraBefore(); i++) { + keyIt.next(); + } - keyIt.next(); // Skip past 'term'. + keyIt.next(); // Skip past 'term'. - BSONElement scoreElement = keyIt.next(); - double documentTermScore = scoreElement.number(); + BSONElement scoreElement = keyIt.next(); + double documentTermScore = scoreElement.number(); - // Aggregate relevance score, term keys. - *documentAggregateScore += documentTermScore; - return NEED_TIME; - } + // Aggregate relevance score, term keys. 
+ *documentAggregateScore += documentTermScore; + return NEED_TIME; +} } // namespace mongo diff --git a/src/mongo/db/exec/text.h b/src/mongo/db/exec/text.h index 96f5c67bc4d..9e64621cbb7 100644 --- a/src/mongo/db/exec/text.h +++ b/src/mongo/db/exec/text.h @@ -46,157 +46,159 @@ namespace mongo { - using fts::FTSIndexFormat; - using fts::FTSMatcher; - using fts::FTSQuery; - using fts::FTSSpec; - using fts::MAX_WEIGHT; +using fts::FTSIndexFormat; +using fts::FTSMatcher; +using fts::FTSQuery; +using fts::FTSSpec; +using fts::MAX_WEIGHT; - class OperationContext; +class OperationContext; - struct TextStageParams { - TextStageParams(const FTSSpec& s) : spec(s) {} +struct TextStageParams { + TextStageParams(const FTSSpec& s) : spec(s) {} - // Text index descriptor. IndexCatalog owns this. - IndexDescriptor* index; + // Text index descriptor. IndexCatalog owns this. + IndexDescriptor* index; - // Index spec. - FTSSpec spec; + // Index spec. + FTSSpec spec; - // Index keys that precede the "text" index key. - BSONObj indexPrefix; + // Index keys that precede the "text" index key. + BSONObj indexPrefix; - // The text query. - FTSQuery query; - }; + // The text query. + FTSQuery query; +}; +/** + * Implements a blocking stage that returns text search results. + * + * Prerequisites: None; is a leaf node. + * Output type: LOC_AND_OBJ_UNOWNED. + * + * TODO: Should the TextStage ever generate NEED_YIELD requests for fetching MMAP v1 records? + * Right now this stage could reduce concurrency by failing to request a yield during fetch. + */ +class TextStage : public PlanStage { +public: /** - * Implements a blocking stage that returns text search results. - * - * Prerequisites: None; is a leaf node. - * Output type: LOC_AND_OBJ_UNOWNED. - * - * TODO: Should the TextStage ever generate NEED_YIELD requests for fetching MMAP v1 records? - * Right now this stage could reduce concurrency by failing to request a yield during fetch. 
+ * The text stage has a few 'states' it transitions between. */ - class TextStage : public PlanStage { - public: - /** - * The text stage has a few 'states' it transitions between. - */ - enum State { - // 1. Initialize the index scans we use to retrieve term/score info. - INIT_SCANS, + enum State { + // 1. Initialize the index scans we use to retrieve term/score info. + INIT_SCANS, - // 2. Read the terms/scores from the text index. - READING_TERMS, + // 2. Read the terms/scores from the text index. + READING_TERMS, - // 3. Return results to our parent. - RETURNING_RESULTS, + // 3. Return results to our parent. + RETURNING_RESULTS, - // 4. Done. - DONE, - }; + // 4. Done. + DONE, + }; - TextStage(OperationContext* txn, - const TextStageParams& params, - WorkingSet* ws, - const MatchExpression* filter); + TextStage(OperationContext* txn, + const TextStageParams& params, + WorkingSet* ws, + const MatchExpression* filter); - virtual ~TextStage(); + virtual ~TextStage(); - virtual StageState work(WorkingSetID* out); - virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); + virtual bool isEOF(); - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - virtual std::vector<PlanStage*> getChildren() const; + virtual std::vector<PlanStage*> getChildren() const; - virtual StageType stageType() const { return STAGE_TEXT; } + virtual StageType stageType() const { + return STAGE_TEXT; + } - PlanStageStats* getStats(); + PlanStageStats* getStats(); - virtual const CommonStats* getCommonStats() const; + virtual const CommonStats* getCommonStats() const; - virtual const SpecificStats* getSpecificStats() const; + virtual const SpecificStats* getSpecificStats() const; - static 
const char* kStageType; + static const char* kStageType; - private: - /** - * Initializes sub-scanners. - */ - StageState initScans(WorkingSetID* out); +private: + /** + * Initializes sub-scanners. + */ + StageState initScans(WorkingSetID* out); - /** - * Helper for buffering results array. Returns NEED_TIME (if any results were produced), - * IS_EOF, or FAILURE. - */ - StageState readFromSubScanners(WorkingSetID* out); + /** + * Helper for buffering results array. Returns NEED_TIME (if any results were produced), + * IS_EOF, or FAILURE. + */ + StageState readFromSubScanners(WorkingSetID* out); - /** - * Helper called from readFromSubScanners to update aggregate score with a new-found (term, - * score) pair for this document. Also rejects documents that don't match this stage's - * filter. - */ - StageState addTerm(WorkingSetID wsid, WorkingSetID* out); + /** + * Helper called from readFromSubScanners to update aggregate score with a new-found (term, + * score) pair for this document. Also rejects documents that don't match this stage's + * filter. + */ + StageState addTerm(WorkingSetID wsid, WorkingSetID* out); - /** - * Possibly return a result. FYI, this may perform a fetch directly if it is needed to - * evaluate all filters. - */ - StageState returnResults(WorkingSetID* out); + /** + * Possibly return a result. FYI, this may perform a fetch directly if it is needed to + * evaluate all filters. + */ + StageState returnResults(WorkingSetID* out); - // transactional context for read locks. Not owned by us - OperationContext* _txn; + // transactional context for read locks. Not owned by us + OperationContext* _txn; - // Parameters of this text stage. - TextStageParams _params; + // Parameters of this text stage. + TextStageParams _params; - // Text-specific phrase and negated term matcher. - FTSMatcher _ftsMatcher; + // Text-specific phrase and negated term matcher. + FTSMatcher _ftsMatcher; - // Working set. Not owned by us. - WorkingSet* _ws; + // Working set. 
Not owned by us. + WorkingSet* _ws; - // Filter. Not owned by us. - const MatchExpression* _filter; + // Filter. Not owned by us. + const MatchExpression* _filter; - // Stats. - CommonStats _commonStats; - TextStats _specificStats; + // Stats. + CommonStats _commonStats; + TextStats _specificStats; - // What state are we in? See the State enum above. - State _internalState; + // What state are we in? See the State enum above. + State _internalState; - // Used in INIT_SCANS and READING_TERMS. The index scans we're using to retrieve text - // terms. - OwnedPointerVector<PlanStage> _scanners; + // Used in INIT_SCANS and READING_TERMS. The index scans we're using to retrieve text + // terms. + OwnedPointerVector<PlanStage> _scanners; - // Which _scanners are we currently reading from? - size_t _currentIndexScanner; + // Which _scanners are we currently reading from? + size_t _currentIndexScanner; - // If not Null, we use this rather than asking our child what to do next. - WorkingSetID _idRetrying; + // If not Null, we use this rather than asking our child what to do next. + WorkingSetID _idRetrying; - // Map each buffered record id to this data. - struct TextRecordData { - TextRecordData() : wsid(WorkingSet::INVALID_ID), score(0.0) { } - WorkingSetID wsid; - double score; - }; + // Map each buffered record id to this data. + struct TextRecordData { + TextRecordData() : wsid(WorkingSet::INVALID_ID), score(0.0) {} + WorkingSetID wsid; + double score; + }; - // Temporary score data filled out by sub-scans. Used in READING_TERMS and - // RETURNING_RESULTS. - // Maps from diskloc -> (aggregate score for doc, wsid). - typedef unordered_map<RecordId, TextRecordData, RecordId::Hasher> ScoreMap; - ScoreMap _scores; - ScoreMap::const_iterator _scoreIterator; + // Temporary score data filled out by sub-scans. Used in READING_TERMS and + // RETURNING_RESULTS. + // Maps from diskloc -> (aggregate score for doc, wsid). 
+ typedef unordered_map<RecordId, TextRecordData, RecordId::Hasher> ScoreMap; + ScoreMap _scores; + ScoreMap::const_iterator _scoreIterator; - // Used for fetching records from the collection. - std::unique_ptr<RecordCursor> _recordCursor; - }; + // Used for fetching records from the collection. + std::unique_ptr<RecordCursor> _recordCursor; +}; -} // namespace mongo +} // namespace mongo diff --git a/src/mongo/db/exec/update.cpp b/src/mongo/db/exec/update.cpp index 8ba566cd7e0..7d2a768664f 100644 --- a/src/mongo/db/exec/update.cpp +++ b/src/mongo/db/exec/update.cpp @@ -46,1070 +46,1025 @@ namespace mongo { - using std::unique_ptr; - using std::string; - using std::vector; +using std::unique_ptr; +using std::string; +using std::vector; - namespace mb = mutablebson; +namespace mb = mutablebson; - namespace { +namespace { - const char idFieldName[] = "_id"; - const FieldRef idFieldRef(idFieldName); +const char idFieldName[] = "_id"; +const FieldRef idFieldRef(idFieldName); - Status storageValid(const mb::Document&, const bool = true); - Status storageValid(const mb::ConstElement&, const bool = true); - Status storageValidChildren(const mb::ConstElement&, const bool = true); +Status storageValid(const mb::Document&, const bool = true); +Status storageValid(const mb::ConstElement&, const bool = true); +Status storageValidChildren(const mb::ConstElement&, const bool = true); - /** - * mutable::document storageValid check -- like BSONObj::_okForStorage - */ - Status storageValid(const mb::Document& doc, const bool deep) { - mb::ConstElement currElem = doc.root().leftChild(); - while (currElem.ok()) { - if (currElem.getFieldName() == idFieldName) { - switch (currElem.getType()) { - case RegEx: - case Array: - case Undefined: - return Status(ErrorCodes::InvalidIdField, - str::stream() << "The '_id' value cannot be of type " - << typeName(currElem.getType())); - default: - break; - } - } - Status s = storageValid(currElem, deep); - if (!s.isOK()) - return s; - currElem = 
currElem.rightSibling(); +/** + * mutable::document storageValid check -- like BSONObj::_okForStorage + */ +Status storageValid(const mb::Document& doc, const bool deep) { + mb::ConstElement currElem = doc.root().leftChild(); + while (currElem.ok()) { + if (currElem.getFieldName() == idFieldName) { + switch (currElem.getType()) { + case RegEx: + case Array: + case Undefined: + return Status(ErrorCodes::InvalidIdField, + str::stream() << "The '_id' value cannot be of type " + << typeName(currElem.getType())); + default: + break; } + } + Status s = storageValid(currElem, deep); + if (!s.isOK()) + return s; + currElem = currElem.rightSibling(); + } + + return Status::OK(); +} - return Status::OK(); +/** + * Validates an element that has a field name which starts with a dollar sign ($). + * In the case of a DBRef field ($id, $ref, [$db]) these fields may be valid in + * the correct order/context only. + */ +Status validateDollarPrefixElement(const mb::ConstElement elem, const bool deep) { + mb::ConstElement curr = elem; + StringData currName = elem.getFieldName(); + LOG(5) << "validateDollarPrefixElement -- validating field '" << currName << "'"; + // Found a $db field + if (currName == "$db") { + if (curr.getType() != String) { + return Status(ErrorCodes::InvalidDBRef, + str::stream() << "The DBRef $db field must be a String, not a " + << typeName(curr.getType())); } + curr = curr.leftSibling(); - /** - * Validates an element that has a field name which starts with a dollar sign ($). - * In the case of a DBRef field ($id, $ref, [$db]) these fields may be valid in - * the correct order/context only. 
- */ - Status validateDollarPrefixElement(const mb::ConstElement elem, const bool deep) { - mb::ConstElement curr = elem; - StringData currName = elem.getFieldName(); - LOG(5) << "validateDollarPrefixElement -- validating field '" << currName << "'"; - // Found a $db field - if (currName == "$db") { - if (curr.getType() != String) { - return Status(ErrorCodes::InvalidDBRef, - str::stream() << "The DBRef $db field must be a String, not a " - << typeName(curr.getType())); - } - curr = curr.leftSibling(); + if (!curr.ok() || (curr.getFieldName() != "$id")) + return Status(ErrorCodes::InvalidDBRef, + "Found $db field without a $id before it, which is invalid."); - if (!curr.ok() || (curr.getFieldName() != "$id")) - return Status(ErrorCodes::InvalidDBRef, - "Found $db field without a $id before it, which is invalid."); + currName = curr.getFieldName(); + } - currName = curr.getFieldName(); - } + // Found a $id field + if (currName == "$id") { + Status s = storageValidChildren(curr, deep); + if (!s.isOK()) + return s; - // Found a $id field - if (currName == "$id") { - Status s = storageValidChildren(curr, deep); - if (!s.isOK()) - return s; + curr = curr.leftSibling(); + if (!curr.ok() || (curr.getFieldName() != "$ref")) { + return Status(ErrorCodes::InvalidDBRef, + "Found $id field without a $ref before it, which is invalid."); + } - curr = curr.leftSibling(); - if (!curr.ok() || (curr.getFieldName() != "$ref")) { - return Status(ErrorCodes::InvalidDBRef, - "Found $id field without a $ref before it, which is invalid."); - } + currName = curr.getFieldName(); + } - currName = curr.getFieldName(); - } + if (currName == "$ref") { + if (curr.getType() != String) { + return Status(ErrorCodes::InvalidDBRef, + str::stream() << "The DBRef $ref field must be a String, not a " + << typeName(curr.getType())); + } - if (currName == "$ref") { - if (curr.getType() != String) { - return Status(ErrorCodes::InvalidDBRef, - str::stream() << "The DBRef $ref field must be a String, not a " 
- << typeName(curr.getType())); - } + if (!curr.rightSibling().ok() || curr.rightSibling().getFieldName() != "$id") + return Status(ErrorCodes::InvalidDBRef, + str::stream() << "The DBRef $ref field must be " + "following by a $id field"); + } else { + // not an okay, $ prefixed field name. + return Status(ErrorCodes::DollarPrefixedFieldName, + str::stream() << "The dollar ($) prefixed field '" << elem.getFieldName() + << "' in '" << mb::getFullName(elem) + << "' is not valid for storage."); + } - if (!curr.rightSibling().ok() || curr.rightSibling().getFieldName() != "$id") - return Status(ErrorCodes::InvalidDBRef, - str::stream() << "The DBRef $ref field must be " - "following by a $id field"); - } - else { - // not an okay, $ prefixed field name. - return Status(ErrorCodes::DollarPrefixedFieldName, - str::stream() << "The dollar ($) prefixed field '" - << elem.getFieldName() << "' in '" - << mb::getFullName(elem) - << "' is not valid for storage."); + return Status::OK(); +} +/** + * Checks that all parents, of the element passed in, are valid for storage + * + * Note: The elem argument must be in a valid state when using this function + */ +Status storageValidParents(const mb::ConstElement& elem) { + const mb::ConstElement& root = elem.getDocument().root(); + if (elem != root) { + const mb::ConstElement& parent = elem.parent(); + if (parent.ok() && parent != root) { + Status s = storageValid(parent, false); + if (s.isOK()) { + s = storageValidParents(parent); } - return Status::OK(); + return s; + } + } + return Status::OK(); +} + +Status storageValid(const mb::ConstElement& elem, const bool deep) { + if (!elem.ok()) + return Status(ErrorCodes::BadValue, "Invalid elements cannot be stored."); + + // Field names of elements inside arrays are not meaningful in mutable bson, + // so we do not want to validate them. + // + // TODO: Revisit how mutable handles array field names. 
We going to need to make + // this better if we ever want to support ordered updates that can alter the same + // element repeatedly; see SERVER-12848. + const mb::ConstElement& parent = elem.parent(); + const bool childOfArray = parent.ok() ? (parent.getType() == mongo::Array) : false; + + if (!childOfArray) { + StringData fieldName = elem.getFieldName(); + // Cannot start with "$", unless dbref + if (fieldName[0] == '$') { + Status status = validateDollarPrefixElement(elem, deep); + if (!status.isOK()) + return status; + } else if (fieldName.find(".") != string::npos) { + // Field name cannot have a "." in it. + return Status(ErrorCodes::DottedFieldName, + str::stream() << "The dotted field '" << elem.getFieldName() << "' in '" + << mb::getFullName(elem) << "' is not valid for storage."); } + } - /** - * Checks that all parents, of the element passed in, are valid for storage - * - * Note: The elem argument must be in a valid state when using this function - */ - Status storageValidParents(const mb::ConstElement& elem) { - const mb::ConstElement& root = elem.getDocument().root(); - if (elem != root) { - const mb::ConstElement& parent = elem.parent(); - if (parent.ok() && parent != root) { - Status s = storageValid(parent, false); - if (s.isOK()) { - s = storageValidParents(parent); - } + if (deep) { + // Check children if there are any. + Status s = storageValidChildren(elem, deep); + if (!s.isOK()) + return s; + } - return s; - } - } - return Status::OK(); - } + return Status::OK(); +} - Status storageValid(const mb::ConstElement& elem, const bool deep) { - if (!elem.ok()) - return Status(ErrorCodes::BadValue, "Invalid elements cannot be stored."); - - // Field names of elements inside arrays are not meaningful in mutable bson, - // so we do not want to validate them. - // - // TODO: Revisit how mutable handles array field names. 
We going to need to make - // this better if we ever want to support ordered updates that can alter the same - // element repeatedly; see SERVER-12848. - const mb::ConstElement& parent = elem.parent(); - const bool childOfArray = parent.ok() ? (parent.getType() == mongo::Array) : false; - - if (!childOfArray) { - StringData fieldName = elem.getFieldName(); - // Cannot start with "$", unless dbref - if (fieldName[0] == '$') { - Status status = validateDollarPrefixElement(elem, deep); - if (!status.isOK()) - return status; - } - else if (fieldName.find(".") != string::npos) { - // Field name cannot have a "." in it. - return Status(ErrorCodes::DottedFieldName, - str::stream() << "The dotted field '" - << elem.getFieldName() << "' in '" - << mb::getFullName(elem) - << "' is not valid for storage."); - } - } +Status storageValidChildren(const mb::ConstElement& elem, const bool deep) { + if (!elem.hasChildren()) + return Status::OK(); - if (deep) { - // Check children if there are any. - Status s = storageValidChildren(elem, deep); - if (!s.isOK()) - return s; - } + mb::ConstElement curr = elem.leftChild(); + while (curr.ok()) { + Status s = storageValid(curr, deep); + if (!s.isOK()) + return s; + curr = curr.rightSibling(); + } + + return Status::OK(); +} - return Status::OK(); +/** + * This will verify that all updated fields are + * 1.) Valid for storage (checking parent to make sure things like DBRefs are valid) + * 2.) Compare updated immutable fields do not change values + * + * If updateFields is empty then it was replacement and/or we need to check all fields + */ +inline Status validate(const BSONObj& original, + const FieldRefSet& updatedFields, + const mb::Document& updated, + const std::vector<FieldRef*>* immutableAndSingleValueFields, + const ModifierInterface::Options& opts) { + LOG(3) << "update validate options -- " + << " updatedFields: " << updatedFields << " immutableAndSingleValueFields.size:" + << (immutableAndSingleValueFields ? 
immutableAndSingleValueFields->size() : 0) + << " validate:" << opts.enforceOkForStorage; + + // 1.) Loop through each updated field and validate for storage + // and detect immutable field updates + + // The set of possibly changed immutable fields -- we will need to check their vals + FieldRefSet changedImmutableFields; + + // Check to see if there were no fields specified or if we are not validating + // The case if a range query, or query that didn't result in saved fields + if (updatedFields.empty() || !opts.enforceOkForStorage) { + if (opts.enforceOkForStorage) { + // No specific fields were updated so the whole doc must be checked + Status s = storageValid(updated, true); + if (!s.isOK()) + return s; } - Status storageValidChildren(const mb::ConstElement& elem, const bool deep) { - if (!elem.hasChildren()) - return Status::OK(); + // Check all immutable fields + if (immutableAndSingleValueFields) + changedImmutableFields.fillFrom(*immutableAndSingleValueFields); + } else { + // TODO: Change impl so we don't need to create a new FieldRefSet + // -- move all conflict logic into static function on FieldRefSet? + FieldRefSet immutableFieldRef; + if (immutableAndSingleValueFields) + immutableFieldRef.fillFrom(*immutableAndSingleValueFields); + + FieldRefSet::const_iterator where = updatedFields.begin(); + const FieldRefSet::const_iterator end = updatedFields.end(); + for (; where != end; ++where) { + const FieldRef& current = **where; + + // Find the updated field in the updated document. 
+ mutablebson::ConstElement newElem = updated.root(); + size_t currentPart = 0; + while (newElem.ok() && currentPart < current.numParts()) + newElem = newElem[current.getPart(currentPart++)]; + + // newElem might be missing if $unset/$renamed-away + if (newElem.ok()) { + // Check element, and its children + Status s = storageValid(newElem, true); + if (!s.isOK()) + return s; - mb::ConstElement curr = elem.leftChild(); - while (curr.ok()) { - Status s = storageValid(curr, deep); + // Check parents to make sure they are valid as well. + s = storageValidParents(newElem); if (!s.isOK()) return s; - curr = curr.rightSibling(); } - - return Status::OK(); + // Check if the updated field conflicts with immutable fields + immutableFieldRef.findConflicts(¤t, &changedImmutableFields); } + } - /** - * This will verify that all updated fields are - * 1.) Valid for storage (checking parent to make sure things like DBRefs are valid) - * 2.) Compare updated immutable fields do not change values - * - * If updateFields is empty then it was replacement and/or we need to check all fields - */ - inline Status validate(const BSONObj& original, - const FieldRefSet& updatedFields, - const mb::Document& updated, - const std::vector<FieldRef*>* immutableAndSingleValueFields, - const ModifierInterface::Options& opts) { - - LOG(3) << "update validate options -- " - << " updatedFields: " << updatedFields - << " immutableAndSingleValueFields.size:" - << (immutableAndSingleValueFields ? immutableAndSingleValueFields->size() : 0) - << " validate:" << opts.enforceOkForStorage; - - // 1.) 
Loop through each updated field and validate for storage - // and detect immutable field updates - - // The set of possibly changed immutable fields -- we will need to check their vals - FieldRefSet changedImmutableFields; - - // Check to see if there were no fields specified or if we are not validating - // The case if a range query, or query that didn't result in saved fields - if (updatedFields.empty() || !opts.enforceOkForStorage) { - if (opts.enforceOkForStorage) { - // No specific fields were updated so the whole doc must be checked - Status s = storageValid(updated, true); - if (!s.isOK()) - return s; - } - - // Check all immutable fields - if (immutableAndSingleValueFields) - changedImmutableFields.fillFrom(*immutableAndSingleValueFields); - } - else { - - // TODO: Change impl so we don't need to create a new FieldRefSet - // -- move all conflict logic into static function on FieldRefSet? - FieldRefSet immutableFieldRef; - if (immutableAndSingleValueFields) - immutableFieldRef.fillFrom(*immutableAndSingleValueFields); - - FieldRefSet::const_iterator where = updatedFields.begin(); - const FieldRefSet::const_iterator end = updatedFields.end(); - for( ; where != end; ++where) { - const FieldRef& current = **where; - - // Find the updated field in the updated document. - mutablebson::ConstElement newElem = updated.root(); - size_t currentPart = 0; - while (newElem.ok() && currentPart < current.numParts()) - newElem = newElem[current.getPart(currentPart++)]; - - // newElem might be missing if $unset/$renamed-away - if (newElem.ok()) { - - // Check element, and its children - Status s = storageValid(newElem, true); - if (!s.isOK()) - return s; - - // Check parents to make sure they are valid as well. 
- s = storageValidParents(newElem); - if (!s.isOK()) - return s; - - } - // Check if the updated field conflicts with immutable fields - immutableFieldRef.findConflicts(¤t, &changedImmutableFields); - } - } + const bool checkIdField = (updatedFields.empty() && !original.isEmpty()) || + updatedFields.findConflicts(&idFieldRef, NULL); - const bool checkIdField = (updatedFields.empty() && !original.isEmpty()) || - updatedFields.findConflicts(&idFieldRef, NULL); + // Add _id to fields to check since it too is immutable + if (checkIdField) + changedImmutableFields.keepShortest(&idFieldRef); + else if (changedImmutableFields.empty()) { + // Return early if nothing changed which is immutable + return Status::OK(); + } - // Add _id to fields to check since it too is immutable - if (checkIdField) - changedImmutableFields.keepShortest(&idFieldRef); - else if (changedImmutableFields.empty()) { - // Return early if nothing changed which is immutable - return Status::OK(); + LOG(4) << "Changed immutable fields: " << changedImmutableFields; + // 2.) Now compare values of the changed immutable fields (to make sure they haven't) + + const mutablebson::ConstElement newIdElem = updated.root()[idFieldName]; + + FieldRefSet::const_iterator where = changedImmutableFields.begin(); + const FieldRefSet::const_iterator end = changedImmutableFields.end(); + for (; where != end; ++where) { + const FieldRef& current = **where; + + // Find the updated field in the updated document. 
+ mutablebson::ConstElement newElem = updated.root(); + size_t currentPart = 0; + while (newElem.ok() && currentPart < current.numParts()) + newElem = newElem[current.getPart(currentPart++)]; + + if (!newElem.ok()) { + if (original.isEmpty()) { + // If the _id is missing and not required, then skip this check + if (!(current.dottedField() == idFieldName)) + return Status(ErrorCodes::NoSuchKey, + mongoutils::str::stream() << "After applying the update, the new" + << " document was missing the '" + << current.dottedField() + << "' (required and immutable) field."); + + } else { + if (current.dottedField() != idFieldName) + return Status(ErrorCodes::ImmutableField, + mongoutils::str::stream() + << "After applying the update to the document with " + << newIdElem.toString() << ", the '" << current.dottedField() + << "' (required and immutable) field was " + "found to have been removed --" << original); } + } else { + // Find the potentially affected field in the original document. + const BSONElement oldElem = original.getFieldDotted(current.dottedField()); + const BSONElement oldIdElem = original.getField(idFieldName); - LOG(4) << "Changed immutable fields: " << changedImmutableFields; - // 2.) Now compare values of the changed immutable fields (to make sure they haven't) - - const mutablebson::ConstElement newIdElem = updated.root()[idFieldName]; - - FieldRefSet::const_iterator where = changedImmutableFields.begin(); - const FieldRefSet::const_iterator end = changedImmutableFields.end(); - for( ; where != end; ++where ) { - const FieldRef& current = **where; - - // Find the updated field in the updated document. 
- mutablebson::ConstElement newElem = updated.root(); - size_t currentPart = 0; - while (newElem.ok() && currentPart < current.numParts()) - newElem = newElem[current.getPart(currentPart++)]; - - if (!newElem.ok()) { - if (original.isEmpty()) { - // If the _id is missing and not required, then skip this check - if (!(current.dottedField() == idFieldName)) - return Status(ErrorCodes::NoSuchKey, - mongoutils::str::stream() - << "After applying the update, the new" - << " document was missing the '" - << current.dottedField() - << "' (required and immutable) field."); - - } - else { - if (current.dottedField() != idFieldName) - return Status(ErrorCodes::ImmutableField, - mongoutils::str::stream() - << "After applying the update to the document with " - << newIdElem.toString() - << ", the '" << current.dottedField() - << "' (required and immutable) field was " - "found to have been removed --" - << original); - } - } - else { - - // Find the potentially affected field in the original document. - const BSONElement oldElem = original.getFieldDotted(current.dottedField()); - const BSONElement oldIdElem = original.getField(idFieldName); - - // Ensure no arrays since neither _id nor shard keys can be in an array, or one. - mb::ConstElement currElem = newElem; - while (currElem.ok()) { - if (currElem.getType() == Array) { - return Status(ErrorCodes::NotSingleValueField, - mongoutils::str::stream() - << "After applying the update to the document {" - << (oldIdElem.ok() ? oldIdElem.toString() : - newIdElem.toString()) - << " , ...}, the (immutable) field '" - << current.dottedField() - << "' was found to be an array or array descendant."); - } - currElem = currElem.parent(); - } - - // If we have both (old and new), compare them. 
If we just have new we are good - if (oldElem.ok() && newElem.compareWithBSONElement(oldElem, false) != 0) { - return Status(ErrorCodes::ImmutableField, - mongoutils::str::stream() - << "After applying the update to the document {" - << oldElem.toString() - << " , ...}, the (immutable) field '" << current.dottedField() - << "' was found to have been altered to " - << newElem.toString()); - } + // Ensure no arrays since neither _id nor shard keys can be in an array, or one. + mb::ConstElement currElem = newElem; + while (currElem.ok()) { + if (currElem.getType() == Array) { + return Status( + ErrorCodes::NotSingleValueField, + mongoutils::str::stream() + << "After applying the update to the document {" + << (oldIdElem.ok() ? oldIdElem.toString() : newIdElem.toString()) + << " , ...}, the (immutable) field '" << current.dottedField() + << "' was found to be an array or array descendant."); } + currElem = currElem.parent(); } - return Status::OK(); - } - - Status ensureIdAndFirst(mb::Document& doc) { - mb::Element idElem = mb::findFirstChildNamed(doc.root(), idFieldName); - - // Move _id as first element if it exists - if (idElem.ok()) { - if (idElem.leftSibling().ok()) { - Status s = idElem.remove(); - if (!s.isOK()) - return s; - s = doc.root().pushFront(idElem); - if (!s.isOK()) - return s; - } - } - else { - // Create _id if the document does not currently have one. - idElem = doc.makeElementNewOID(idFieldName); - if (!idElem.ok()) - return Status(ErrorCodes::BadValue, - "Could not create new _id ObjectId element.", - 17268); - Status s = doc.root().pushFront(idElem); - if (!s.isOK()) - return s; + // If we have both (old and new), compare them. 
If we just have new we are good + if (oldElem.ok() && newElem.compareWithBSONElement(oldElem, false) != 0) { + return Status(ErrorCodes::ImmutableField, + mongoutils::str::stream() + << "After applying the update to the document {" + << oldElem.toString() << " , ...}, the (immutable) field '" + << current.dottedField() << "' was found to have been altered to " + << newElem.toString()); } - - return Status::OK(); } - - } // namespace - - // static - const char* UpdateStage::kStageType = "UPDATE"; - - UpdateStage::UpdateStage(OperationContext* txn, - const UpdateStageParams& params, - WorkingSet* ws, - Collection* collection, - PlanStage* child) - : _txn(txn), - _params(params), - _ws(ws), - _collection(collection), - _child(child), - _idRetrying(WorkingSet::INVALID_ID), - _idReturning(WorkingSet::INVALID_ID), - _commonStats(kStageType), - _updatedLocs(params.request->isMulti() ? new DiskLocSet() : NULL), - _doc(params.driver->getDocument()) { - // We are an update until we fall into the insert case. - params.driver->setContext(ModifierInterface::ExecInfo::UPDATE_CONTEXT); - - // Before we even start executing, we know whether or not this is a replacement - // style or $mod style update. - _specificStats.isDocReplacement = params.driver->isDocReplacement(); } - BSONObj UpdateStage::transformAndUpdate(const Snapshotted<BSONObj>& oldObj, RecordId& loc) { - const UpdateRequest* request = _params.request; - UpdateDriver* driver = _params.driver; - CanonicalQuery* cq = _params.canonicalQuery; - UpdateLifecycle* lifecycle = request->getLifecycle(); - - // If asked to return new doc, default to the oldObj, in case nothing changes. - BSONObj newObj = oldObj.value(); - - // Ask the driver to apply the mods. It may be that the driver can apply those "in - // place", that is, some values of the old document just get adjusted without any - // change to the binary layout on the bson layer. 
It may be that a whole new document - // is needed to accomodate the new bson layout of the resulting document. In any event, - // only enable in-place mutations if the underlying storage engine offers support for - // writing damage events. - _doc.reset(oldObj.value(), - (_collection->updateWithDamagesSupported() ? - mutablebson::Document::kInPlaceEnabled : - mutablebson::Document::kInPlaceDisabled)); - - BSONObj logObj; - - FieldRefSet updatedFields; - bool docWasModified = false; - - Status status = Status::OK(); - if (!driver->needMatchDetails()) { - // If we don't need match details, avoid doing the rematch - status = driver->update(StringData(), &_doc, &logObj, &updatedFields, &docWasModified); + return Status::OK(); +} + +Status ensureIdAndFirst(mb::Document& doc) { + mb::Element idElem = mb::findFirstChildNamed(doc.root(), idFieldName); + + // Move _id as first element if it exists + if (idElem.ok()) { + if (idElem.leftSibling().ok()) { + Status s = idElem.remove(); + if (!s.isOK()) + return s; + s = doc.root().pushFront(idElem); + if (!s.isOK()) + return s; } - else { - // If there was a matched field, obtain it. - MatchDetails matchDetails; - matchDetails.requestElemMatchKey(); + } else { + // Create _id if the document does not currently have one. 
+ idElem = doc.makeElementNewOID(idFieldName); + if (!idElem.ok()) + return Status( + ErrorCodes::BadValue, "Could not create new _id ObjectId element.", 17268); + Status s = doc.root().pushFront(idElem); + if (!s.isOK()) + return s; + } - dassert(cq); - verify(cq->root()->matchesBSON(oldObj.value(), &matchDetails)); + return Status::OK(); +} + +} // namespace + +// static +const char* UpdateStage::kStageType = "UPDATE"; + +UpdateStage::UpdateStage(OperationContext* txn, + const UpdateStageParams& params, + WorkingSet* ws, + Collection* collection, + PlanStage* child) + : _txn(txn), + _params(params), + _ws(ws), + _collection(collection), + _child(child), + _idRetrying(WorkingSet::INVALID_ID), + _idReturning(WorkingSet::INVALID_ID), + _commonStats(kStageType), + _updatedLocs(params.request->isMulti() ? new DiskLocSet() : NULL), + _doc(params.driver->getDocument()) { + // We are an update until we fall into the insert case. + params.driver->setContext(ModifierInterface::ExecInfo::UPDATE_CONTEXT); + + // Before we even start executing, we know whether or not this is a replacement + // style or $mod style update. + _specificStats.isDocReplacement = params.driver->isDocReplacement(); +} + +BSONObj UpdateStage::transformAndUpdate(const Snapshotted<BSONObj>& oldObj, RecordId& loc) { + const UpdateRequest* request = _params.request; + UpdateDriver* driver = _params.driver; + CanonicalQuery* cq = _params.canonicalQuery; + UpdateLifecycle* lifecycle = request->getLifecycle(); + + // If asked to return new doc, default to the oldObj, in case nothing changes. + BSONObj newObj = oldObj.value(); + + // Ask the driver to apply the mods. It may be that the driver can apply those "in + // place", that is, some values of the old document just get adjusted without any + // change to the binary layout on the bson layer. It may be that a whole new document + // is needed to accomodate the new bson layout of the resulting document. 
In any event, + // only enable in-place mutations if the underlying storage engine offers support for + // writing damage events. + _doc.reset(oldObj.value(), + (_collection->updateWithDamagesSupported() + ? mutablebson::Document::kInPlaceEnabled + : mutablebson::Document::kInPlaceDisabled)); + + BSONObj logObj; + + FieldRefSet updatedFields; + bool docWasModified = false; + + Status status = Status::OK(); + if (!driver->needMatchDetails()) { + // If we don't need match details, avoid doing the rematch + status = driver->update(StringData(), &_doc, &logObj, &updatedFields, &docWasModified); + } else { + // If there was a matched field, obtain it. + MatchDetails matchDetails; + matchDetails.requestElemMatchKey(); + + dassert(cq); + verify(cq->root()->matchesBSON(oldObj.value(), &matchDetails)); + + string matchedField; + if (matchDetails.hasElemMatchKey()) + matchedField = matchDetails.elemMatchKey(); + + // TODO: Right now, each mod checks in 'prepare' that if it needs positional + // data, that a non-empty StringData() was provided. In principle, we could do + // that check here in an else clause to the above conditional and remove the + // checks from the mods. + + status = driver->update(matchedField, &_doc, &logObj, &updatedFields, &docWasModified); + } - string matchedField; - if (matchDetails.hasElemMatchKey()) - matchedField = matchDetails.elemMatchKey(); + if (!status.isOK()) { + uasserted(16837, status.reason()); + } - // TODO: Right now, each mod checks in 'prepare' that if it needs positional - // data, that a non-empty StringData() was provided. In principle, we could do - // that check here in an else clause to the above conditional and remove the - // checks from the mods. 
+ // Ensure _id exists and is first + uassertStatusOK(ensureIdAndFirst(_doc)); + + // See if the changes were applied in place + const char* source = NULL; + const bool inPlace = _doc.getInPlaceUpdates(&_damages, &source); + + if (inPlace && _damages.empty()) { + // An interesting edge case. A modifier didn't notice that it was really a no-op + // during its 'prepare' phase. That represents a missed optimization, but we still + // shouldn't do any real work. Toggle 'docWasModified' to 'false'. + // + // Currently, an example of this is '{ $pushAll : { x : [] } }' when the 'x' array + // exists. + docWasModified = false; + } - status = driver->update(matchedField, &_doc, &logObj, &updatedFields, &docWasModified); - } + if (docWasModified) { + // Verify that no immutable fields were changed and data is valid for storage. - if (!status.isOK()) { - uasserted(16837, status.reason()); - } + if (!(!_txn->writesAreReplicated() || request->isFromMigration())) { + const std::vector<FieldRef*>* immutableFields = NULL; + if (lifecycle) + immutableFields = lifecycle->getImmutableFields(); - // Ensure _id exists and is first - uassertStatusOK(ensureIdAndFirst(_doc)); - - // See if the changes were applied in place - const char* source = NULL; - const bool inPlace = _doc.getInPlaceUpdates(&_damages, &source); - - if (inPlace && _damages.empty()) { - // An interesting edge case. A modifier didn't notice that it was really a no-op - // during its 'prepare' phase. That represents a missed optimization, but we still - // shouldn't do any real work. Toggle 'docWasModified' to 'false'. - // - // Currently, an example of this is '{ $pushAll : { x : [] } }' when the 'x' array - // exists. - docWasModified = false; + uassertStatusOK(validate( + oldObj.value(), updatedFields, _doc, immutableFields, driver->modOptions())); } - if (docWasModified) { - - // Verify that no immutable fields were changed and data is valid for storage. 
- - if (!(!_txn->writesAreReplicated() || request->isFromMigration())) { - const std::vector<FieldRef*>* immutableFields = NULL; - if (lifecycle) - immutableFields = lifecycle->getImmutableFields(); - - uassertStatusOK(validate(oldObj.value(), - updatedFields, - _doc, - immutableFields, - driver->modOptions()) ); + // Prepare to write back the modified document + WriteUnitOfWork wunit(_txn); + + RecordId newLoc; + + if (inPlace) { + // Don't actually do the write if this is an explain. + if (!request->isExplain()) { + invariant(_collection); + newObj = oldObj.value(); + const RecordData oldRec(oldObj.value().objdata(), oldObj.value().objsize()); + BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request->isMulti()); + oplogUpdateEntryArgs args; + args.update = logObj; + args.criteria = idQuery; + args.fromMigrate = request->isFromMigration(); + _collection->updateDocumentWithDamages( + _txn, + loc, + Snapshotted<RecordData>(oldObj.snapshotId(), oldRec), + source, + _damages, + args); } - // Prepare to write back the modified document - WriteUnitOfWork wunit(_txn); - - RecordId newLoc; - - if (inPlace) { - - // Don't actually do the write if this is an explain. - if (!request->isExplain()) { - invariant(_collection); - newObj = oldObj.value(); - const RecordData oldRec(oldObj.value().objdata(), oldObj.value().objsize()); - BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request->isMulti()); - oplogUpdateEntryArgs args; - args.update = logObj; - args.criteria = idQuery; - args.fromMigrate = request->isFromMigration(); - _collection->updateDocumentWithDamages( - _txn, - loc, - Snapshotted<RecordData>(oldObj.snapshotId(), oldRec), - source, - _damages, - args); - } - - _specificStats.fastmod = true; - newLoc = loc; - } - else { - // The updates were not in place. Apply them through the file manager. 
- - newObj = _doc.getObject(); - uassert(17419, - str::stream() << "Resulting document after update is larger than " - << BSONObjMaxUserSize, - newObj.objsize() <= BSONObjMaxUserSize); - - // Don't actually do the write if this is an explain. - if (!request->isExplain()) { - invariant(_collection); - BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request->isMulti()); - oplogUpdateEntryArgs args; - args.update = logObj; - args.criteria = idQuery; - args.fromMigrate = request->isFromMigration(); - StatusWith<RecordId> res = _collection->updateDocument( - _txn, - loc, - oldObj, - newObj, - true, - driver->modsAffectIndices(), - _params.opDebug, - args); - uassertStatusOK(res.getStatus()); - newLoc = res.getValue(); - } - } - - invariant(oldObj.snapshotId() == _txn->recoveryUnit()->getSnapshotId()); - wunit.commit(); - - // If the document moved, we might see it again in a collection scan (maybe it's - // a document after our current document). - // - // If the document is indexed and the mod changes an indexed value, we might see - // it again. For an example, see the comment above near declaration of - // updatedLocs. - // - // This must be done after the wunit commits so we are sure we won't be rolling back. - if (_updatedLocs && (newLoc != loc || driver->modsAffectIndices())) { - _updatedLocs->insert(newLoc); + _specificStats.fastmod = true; + newLoc = loc; + } else { + // The updates were not in place. Apply them through the file manager. + + newObj = _doc.getObject(); + uassert(17419, + str::stream() << "Resulting document after update is larger than " + << BSONObjMaxUserSize, + newObj.objsize() <= BSONObjMaxUserSize); + + // Don't actually do the write if this is an explain. 
+ if (!request->isExplain()) { + invariant(_collection); + BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request->isMulti()); + oplogUpdateEntryArgs args; + args.update = logObj; + args.criteria = idQuery; + args.fromMigrate = request->isFromMigration(); + StatusWith<RecordId> res = _collection->updateDocument(_txn, + loc, + oldObj, + newObj, + true, + driver->modsAffectIndices(), + _params.opDebug, + args); + uassertStatusOK(res.getStatus()); + newLoc = res.getValue(); } } - // Only record doc modifications if they wrote (exclude no-ops). Explains get - // recorded as if they wrote. - if (docWasModified || request->isExplain()) { - _specificStats.nModified++; + invariant(oldObj.snapshotId() == _txn->recoveryUnit()->getSnapshotId()); + wunit.commit(); + + // If the document moved, we might see it again in a collection scan (maybe it's + // a document after our current document). + // + // If the document is indexed and the mod changes an indexed value, we might see + // it again. For an example, see the comment above near declaration of + // updatedLocs. + // + // This must be done after the wunit commits so we are sure we won't be rolling back. + if (_updatedLocs && (newLoc != loc || driver->modsAffectIndices())) { + _updatedLocs->insert(newLoc); } - - return newObj; } - // static - Status UpdateStage::applyUpdateOpsForInsert(const CanonicalQuery* cq, - const BSONObj& query, - UpdateDriver* driver, - UpdateLifecycle* lifecycle, - mutablebson::Document* doc, - bool isInternalRequest, - UpdateStats* stats, - BSONObj* out) { - // Since this is an insert (no docs found and upsert:true), we will be logging it - // as an insert in the oplog. We don't need the driver's help to build the - // oplog record, then. We also set the context of the update driver to the INSERT_CONTEXT. - // Some mods may only work in that context (e.g. $setOnInsert). 
- driver->setLogOp(false); - driver->setContext(ModifierInterface::ExecInfo::INSERT_CONTEXT); - - const vector<FieldRef*>* immutablePaths = NULL; - if (!isInternalRequest && lifecycle) - immutablePaths = lifecycle->getImmutableFields(); - - // The original document we compare changes to - immutable paths must not change - BSONObj original; - - if (cq) { - Status status = driver->populateDocumentWithQueryFields(cq, immutablePaths, *doc); - if (!status.isOK()) { - return status; - } + // Only record doc modifications if they wrote (exclude no-ops). Explains get + // recorded as if they wrote. + if (docWasModified || request->isExplain()) { + _specificStats.nModified++; + } - if (driver->isDocReplacement()) - stats->fastmodinsert = true; - original = doc->getObject(); - } - else { - fassert(17354, CanonicalQuery::isSimpleIdQuery(query)); - BSONElement idElt = query[idFieldName]; - original = idElt.wrap(); - fassert(17352, doc->root().appendElement(idElt)); + return newObj; +} + +// static +Status UpdateStage::applyUpdateOpsForInsert(const CanonicalQuery* cq, + const BSONObj& query, + UpdateDriver* driver, + UpdateLifecycle* lifecycle, + mutablebson::Document* doc, + bool isInternalRequest, + UpdateStats* stats, + BSONObj* out) { + // Since this is an insert (no docs found and upsert:true), we will be logging it + // as an insert in the oplog. We don't need the driver's help to build the + // oplog record, then. We also set the context of the update driver to the INSERT_CONTEXT. + // Some mods may only work in that context (e.g. $setOnInsert). 
+ driver->setLogOp(false); + driver->setContext(ModifierInterface::ExecInfo::INSERT_CONTEXT); + + const vector<FieldRef*>* immutablePaths = NULL; + if (!isInternalRequest && lifecycle) + immutablePaths = lifecycle->getImmutableFields(); + + // The original document we compare changes to - immutable paths must not change + BSONObj original; + + if (cq) { + Status status = driver->populateDocumentWithQueryFields(cq, immutablePaths, *doc); + if (!status.isOK()) { + return status; } - // Apply the update modifications here. - Status updateStatus = driver->update(StringData(), doc); - if (!updateStatus.isOK()) { - return Status(updateStatus.code(), updateStatus.reason(), 16836); - } + if (driver->isDocReplacement()) + stats->fastmodinsert = true; + original = doc->getObject(); + } else { + fassert(17354, CanonicalQuery::isSimpleIdQuery(query)); + BSONElement idElt = query[idFieldName]; + original = idElt.wrap(); + fassert(17352, doc->root().appendElement(idElt)); + } - // Ensure _id exists and is first - Status idAndFirstStatus = ensureIdAndFirst(*doc); - if (!idAndFirstStatus.isOK()) { - return idAndFirstStatus; - } + // Apply the update modifications here. + Status updateStatus = driver->update(StringData(), doc); + if (!updateStatus.isOK()) { + return Status(updateStatus.code(), updateStatus.reason(), 16836); + } - // Validate that the object replacement or modifiers resulted in a document - // that contains all the immutable keys and can be stored if it isn't coming - // from a migration or via replication. - if (!isInternalRequest) { - FieldRefSet noFields; - // This will only validate the modified fields if not a replacement. 
- Status validateStatus = validate(original, - noFields, - *doc, - immutablePaths, - driver->modOptions()); - if (!validateStatus.isOK()) { - return validateStatus; - } - } + // Ensure _id exists and is first + Status idAndFirstStatus = ensureIdAndFirst(*doc); + if (!idAndFirstStatus.isOK()) { + return idAndFirstStatus; + } - BSONObj newObj = doc->getObject(); - if (newObj.objsize() > BSONObjMaxUserSize) { - return Status(ErrorCodes::InvalidBSON, - str::stream() << "Document to upsert is larger than " - << BSONObjMaxUserSize, - 17420); + // Validate that the object replacement or modifiers resulted in a document + // that contains all the immutable keys and can be stored if it isn't coming + // from a migration or via replication. + if (!isInternalRequest) { + FieldRefSet noFields; + // This will only validate the modified fields if not a replacement. + Status validateStatus = + validate(original, noFields, *doc, immutablePaths, driver->modOptions()); + if (!validateStatus.isOK()) { + return validateStatus; } + } - *out = newObj; - return Status::OK(); + BSONObj newObj = doc->getObject(); + if (newObj.objsize() > BSONObjMaxUserSize) { + return Status(ErrorCodes::InvalidBSON, + str::stream() << "Document to upsert is larger than " << BSONObjMaxUserSize, + 17420); } - void UpdateStage::doInsert() { - _specificStats.inserted = true; + *out = newObj; + return Status::OK(); +} - const UpdateRequest* request = _params.request; - bool isInternalRequest = !_txn->writesAreReplicated() || request->isFromMigration(); +void UpdateStage::doInsert() { + _specificStats.inserted = true; - // Reset the document we will be writing to. 
- _doc.reset(); + const UpdateRequest* request = _params.request; + bool isInternalRequest = !_txn->writesAreReplicated() || request->isFromMigration(); - BSONObj newObj; - uassertStatusOK(applyUpdateOpsForInsert(_params.canonicalQuery, - request->getQuery(), - _params.driver, - request->getLifecycle(), - &_doc, - isInternalRequest, - &_specificStats, - &newObj)); + // Reset the document we will be writing to. + _doc.reset(); - _specificStats.objInserted = newObj; + BSONObj newObj; + uassertStatusOK(applyUpdateOpsForInsert(_params.canonicalQuery, + request->getQuery(), + _params.driver, + request->getLifecycle(), + &_doc, + isInternalRequest, + &_specificStats, + &newObj)); - // If this is an explain, bail out now without doing the insert. - if (request->isExplain()) { - return; - } - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - WriteUnitOfWork wunit(_txn); - invariant(_collection); - const bool enforceQuota = !request->isGod(); - uassertStatusOK(_collection->insertDocument(_txn, - newObj, - enforceQuota, - request->isFromMigration())); - - // Technically, we should save/restore state here, but since we are going to return - // immediately after, it would just be wasted work. - wunit.commit(); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(_txn, "upsert", _collection->ns().ns()); - } + _specificStats.objInserted = newObj; - bool UpdateStage::doneUpdating() { - // We're done updating if either the child has no more results to give us, or we've - // already gotten a result back and we're not a multi-update. - return _idRetrying == WorkingSet::INVALID_ID && _idReturning == WorkingSet::INVALID_ID - && (_child->isEOF() || (_specificStats.nMatched > 0 && !_params.request->isMulti())); + // If this is an explain, bail out now without doing the insert. 
+ if (request->isExplain()) { + return; } + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + WriteUnitOfWork wunit(_txn); + invariant(_collection); + const bool enforceQuota = !request->isGod(); + uassertStatusOK( + _collection->insertDocument(_txn, newObj, enforceQuota, request->isFromMigration())); - bool UpdateStage::needInsert() { - // We need to insert if - // 1) we haven't inserted already, - // 2) the child stage returned zero matches, and - // 3) the user asked for an upsert. - return !_specificStats.inserted - && _specificStats.nMatched == 0 - && _params.request->isUpsert(); + // Technically, we should save/restore state here, but since we are going to return + // immediately after, it would just be wasted work. + wunit.commit(); } - - bool UpdateStage::isEOF() { - return doneUpdating() && !needInsert(); + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(_txn, "upsert", _collection->ns().ns()); +} + +bool UpdateStage::doneUpdating() { + // We're done updating if either the child has no more results to give us, or we've + // already gotten a result back and we're not a multi-update. + return _idRetrying == WorkingSet::INVALID_ID && _idReturning == WorkingSet::INVALID_ID && + (_child->isEOF() || (_specificStats.nMatched > 0 && !_params.request->isMulti())); +} + +bool UpdateStage::needInsert() { + // We need to insert if + // 1) we haven't inserted already, + // 2) the child stage returned zero matches, and + // 3) the user asked for an upsert. + return !_specificStats.inserted && _specificStats.nMatched == 0 && _params.request->isUpsert(); +} + +bool UpdateStage::isEOF() { + return doneUpdating() && !needInsert(); +} + +PlanStage::StageState UpdateStage::work(WorkingSetID* out) { + ++_commonStats.works; + + // Adds the amount of time taken by work() to executionTimeMillis. 
+ ScopedTimer timer(&_commonStats.executionTimeMillis); + + if (isEOF()) { + return PlanStage::IS_EOF; } - PlanStage::StageState UpdateStage::work(WorkingSetID* out) { - ++_commonStats.works; + if (doneUpdating()) { + // Even if we're done updating, we may have some inserting left to do. + if (needInsert()) { + // TODO we may want to handle WriteConflictException here. Currently we bounce it + // out to a higher level since if this WCEs it is likely that we raced with another + // upsert that may have matched our query, and therefore this may need to perform an + // update rather than an insert. Bouncing to the higher level allows restarting the + // query in this case. + doInsert(); - // Adds the amount of time taken by work() to executionTimeMillis. - ScopedTimer timer(&_commonStats.executionTimeMillis); + invariant(isEOF()); + if (_params.request->shouldReturnNewDocs()) { + // Want to return the document we just inserted, create it as a WorkingSetMember + // so that we can return it. + BSONObj newObj = _specificStats.objInserted; + *out = _ws->allocate(); + WorkingSetMember* member = _ws->get(*out); + member->obj = + Snapshotted<BSONObj>(_txn->recoveryUnit()->getSnapshotId(), newObj.getOwned()); + member->state = WorkingSetMember::OWNED_OBJ; + ++_commonStats.advanced; + return PlanStage::ADVANCED; + } + } - if (isEOF()) { return PlanStage::IS_EOF; } + // At this point either we're done updating and there was no insert to do, + // or we're done updating and we're done inserting. Either way, we're EOF. + invariant(isEOF()); + return PlanStage::IS_EOF; + } - if (doneUpdating()) { - // Even if we're done updating, we may have some inserting left to do. - if (needInsert()) { - // TODO we may want to handle WriteConflictException here. Currently we bounce it - // out to a higher level since if this WCEs it is likely that we raced with another - // upsert that may have matched our query, and therefore this may need to perform an - // update rather than an insert. 
Bouncing to the higher level allows restarting the - // query in this case. - doInsert(); + // If we're here, then we still have to ask for results from the child and apply + // updates to them. We should only get here if the collection exists. + invariant(_collection); - invariant(isEOF()); - if (_params.request->shouldReturnNewDocs()) { - // Want to return the document we just inserted, create it as a WorkingSetMember - // so that we can return it. - BSONObj newObj = _specificStats.objInserted; - *out = _ws->allocate(); - WorkingSetMember* member = _ws->get(*out); - member->obj = Snapshotted<BSONObj>(_txn->recoveryUnit()->getSnapshotId(), - newObj.getOwned()); - member->state = WorkingSetMember::OWNED_OBJ; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - } + // It is possible that after an update was applied, a WriteConflictException + // occurred and prevented us from returning ADVANCED with the requested version + // of the document. + if (_idReturning != WorkingSet::INVALID_ID) { + // We should only get here if we were trying to return something before. + invariant(_params.request->shouldReturnAnyDocs()); - // At this point either we're done updating and there was no insert to do, - // or we're done updating and we're done inserting. Either way, we're EOF. - invariant(isEOF()); - return PlanStage::IS_EOF; - } + WorkingSetMember* member = _ws->get(_idReturning); + invariant(member->state == WorkingSetMember::OWNED_OBJ); - // If we're here, then we still have to ask for results from the child and apply - // updates to them. We should only get here if the collection exists. - invariant(_collection); + *out = _idReturning; + _idReturning = WorkingSet::INVALID_ID; + ++_commonStats.advanced; + return PlanStage::ADVANCED; + } - // It is possible that after an update was applied, a WriteConflictException - // occurred and prevented us from returning ADVANCED with the requested version - // of the document. 
- if (_idReturning != WorkingSet::INVALID_ID) { - // We should only get here if we were trying to return something before. - invariant(_params.request->shouldReturnAnyDocs()); + // Either retry the last WSM we worked on or get a new one from our child. + WorkingSetID id; + StageState status; + if (_idRetrying == WorkingSet::INVALID_ID) { + status = _child->work(&id); + } else { + status = ADVANCED; + id = _idRetrying; + _idRetrying = WorkingSet::INVALID_ID; + } - WorkingSetMember* member = _ws->get(_idReturning); - invariant(member->state == WorkingSetMember::OWNED_OBJ); + if (PlanStage::ADVANCED == status) { + // Need to get these things from the result returned by the child. + RecordId loc; - *out = _idReturning; - _idReturning = WorkingSet::INVALID_ID; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } + WorkingSetMember* member = _ws->get(id); - // Either retry the last WSM we worked on or get a new one from our child. - WorkingSetID id; - StageState status; - if (_idRetrying == WorkingSet::INVALID_ID) { - status = _child->work(&id); - } - else { - status = ADVANCED; - id = _idRetrying; - _idRetrying = WorkingSet::INVALID_ID; + // We want to free this member when we return, unless we need to retry it. + ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id); + + if (!member->hasLoc()) { + // We expect to be here because of an invalidation causing a force-fetch, and + // doc-locking storage engines do not issue invalidations. + ++_specificStats.nInvalidateSkips; + ++_commonStats.needTime; + return PlanStage::NEED_TIME; } + loc = member->loc; - if (PlanStage::ADVANCED == status) { - // Need to get these things from the result returned by the child. - RecordId loc; + // Updates can't have projections. This means that covering analysis will always add + // a fetch. We should always get fetched data, and never just key data. 
+ invariant(member->hasObj()); - WorkingSetMember* member = _ws->get(id); + // We fill this with the new locs of moved doc so we don't double-update. + if (_updatedLocs && _updatedLocs->count(loc) > 0) { + // Found a loc that refers to a document we had already updated. Note that + // we can never remove from _updatedLocs because updates by other clients + // could cause us to encounter a document again later. + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } - // We want to free this member when we return, unless we need to retry it. - ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id); + try { + std::unique_ptr<RecordCursor> cursor; + if (_txn->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) { + cursor = _collection->getCursor(_txn); + // our snapshot has changed, refetch + if (!WorkingSetCommon::fetch(_txn, member, cursor)) { + // document was deleted, we're done here + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } - if (!member->hasLoc()) { - // We expect to be here because of an invalidation causing a force-fetch, and - // doc-locking storage engines do not issue invalidations. - ++_specificStats.nInvalidateSkips; - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - loc = member->loc; - - // Updates can't have projections. This means that covering analysis will always add - // a fetch. We should always get fetched data, and never just key data. - invariant(member->hasObj()); - - // We fill this with the new locs of moved doc so we don't double-update. - if (_updatedLocs && _updatedLocs->count(loc) > 0) { - // Found a loc that refers to a document we had already updated. Note that - // we can never remove from _updatedLocs because updates by other clients - // could cause us to encounter a document again later. 
- ++_commonStats.needTime; - return PlanStage::NEED_TIME; + // we have to re-match the doc as it might not match anymore + CanonicalQuery* cq = _params.canonicalQuery; + if (cq && !cq->root()->matchesBSON(member->obj.value(), NULL)) { + // doesn't match predicates anymore! + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } } + // Save state before making changes try { - std::unique_ptr<RecordCursor> cursor; - if (_txn->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) { - cursor = _collection->getCursor(_txn); - // our snapshot has changed, refetch - if (!WorkingSetCommon::fetch(_txn, member, cursor)) { - // document was deleted, we're done here - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - - // we have to re-match the doc as it might not match anymore - CanonicalQuery* cq = _params.canonicalQuery; - if (cq && !cq->root()->matchesBSON(member->obj.value(), NULL)) { - // doesn't match predicates anymore! - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } + _child->saveState(); + if (supportsDocLocking()) { + // Doc-locking engines require this after saveState() since they don't use + // invalidations. + WorkingSetCommon::prepareForSnapshotChange(_ws); } + } catch (const WriteConflictException& wce) { + std::terminate(); + } - // Save state before making changes - try { - _child->saveState(); - if (supportsDocLocking()) { - // Doc-locking engines require this after saveState() since they don't use - // invalidations. - WorkingSetCommon::prepareForSnapshotChange(_ws); - } - } - catch ( const WriteConflictException& wce ) { - std::terminate(); - } + // If we care about the pre-updated version of the doc, save it out here. + BSONObj oldObj; + if (_params.request->shouldReturnOldDocs()) { + oldObj = member->obj.value().getOwned(); + } - // If we care about the pre-updated version of the doc, save it out here. 
- BSONObj oldObj; - if (_params.request->shouldReturnOldDocs()) { - oldObj = member->obj.value().getOwned(); - } + // Do the update, get us the new version of the doc. + BSONObj newObj = transformAndUpdate(member->obj, loc); - // Do the update, get us the new version of the doc. - BSONObj newObj = transformAndUpdate(member->obj, loc); - - // Set member's obj to be the doc we want to return. - if (_params.request->shouldReturnAnyDocs()) { - if (_params.request->shouldReturnNewDocs()) { - member->obj = Snapshotted<BSONObj>(_txn->recoveryUnit()->getSnapshotId(), - newObj.getOwned()); - } - else { - invariant(_params.request->shouldReturnOldDocs()); - member->obj.setValue(oldObj); - } - member->loc = RecordId(); - member->state = WorkingSetMember::OWNED_OBJ; + // Set member's obj to be the doc we want to return. + if (_params.request->shouldReturnAnyDocs()) { + if (_params.request->shouldReturnNewDocs()) { + member->obj = Snapshotted<BSONObj>(_txn->recoveryUnit()->getSnapshotId(), + newObj.getOwned()); + } else { + invariant(_params.request->shouldReturnOldDocs()); + member->obj.setValue(oldObj); } + member->loc = RecordId(); + member->state = WorkingSetMember::OWNED_OBJ; } - catch ( const WriteConflictException& wce ) { - _idRetrying = id; - memberFreer.Dismiss(); // Keep this member around so we can retry updating it. - *out = WorkingSet::INVALID_ID; - _commonStats.needYield++; - return NEED_YIELD; - } - - // This should be after transformAndUpdate to make sure we actually updated this doc. - ++_specificStats.nMatched; + } catch (const WriteConflictException& wce) { + _idRetrying = id; + memberFreer.Dismiss(); // Keep this member around so we can retry updating it. + *out = WorkingSet::INVALID_ID; + _commonStats.needYield++; + return NEED_YIELD; + } - // Restore state after modification + // This should be after transformAndUpdate to make sure we actually updated this doc. 
+ ++_specificStats.nMatched; - // As restoreState may restore (recreate) cursors, make sure to restore the - // state outside of the WritUnitOfWork. - try { - _child->restoreState(_txn); - } - catch ( const WriteConflictException& wce ) { - // Note we don't need to retry updating anything in this case since the update - // already was committed. However, we still need to return the updated document - // (if it was requested). - if (_params.request->shouldReturnAnyDocs()) { - // member->obj should refer to the document we want to return. - invariant(member->state == WorkingSetMember::OWNED_OBJ); - - _idReturning = id; - // Keep this member around so that we can return it on the next work() call. - memberFreer.Dismiss(); - } - *out = WorkingSet::INVALID_ID; - _commonStats.needYield++; - return NEED_YIELD; - } + // Restore state after modification + // As restoreState may restore (recreate) cursors, make sure to restore the + // state outside of the WritUnitOfWork. + try { + _child->restoreState(_txn); + } catch (const WriteConflictException& wce) { + // Note we don't need to retry updating anything in this case since the update + // already was committed. However, we still need to return the updated document + // (if it was requested). if (_params.request->shouldReturnAnyDocs()) { // member->obj should refer to the document we want to return. invariant(member->state == WorkingSetMember::OWNED_OBJ); - memberFreer.Dismiss(); // Keep this member around so we can return it. - *out = id; - ++_commonStats.advanced; - return PlanStage::ADVANCED; - } - - ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - else if (PlanStage::IS_EOF == status) { - // The child is out of results, but we might not be done yet because we still might - // have to do an insert. 
- ++_commonStats.needTime; - return PlanStage::NEED_TIME; - } - else if (PlanStage::FAILURE == status) { - *out = id; - // If a stage fails, it may create a status WSM to indicate why it failed, in which case - // 'id' is valid. If ID is invalid, we create our own error message. - if (WorkingSet::INVALID_ID == id) { - const std::string errmsg = "update stage failed to read in results from child"; - *out = WorkingSetCommon::allocateStatusMember(_ws, Status(ErrorCodes::InternalError, - errmsg)); - return PlanStage::FAILURE; + _idReturning = id; + // Keep this member around so that we can return it on the next work() call. + memberFreer.Dismiss(); } - return status; - } - else if (PlanStage::NEED_TIME == status) { - ++_commonStats.needTime; + *out = WorkingSet::INVALID_ID; + _commonStats.needYield++; + return NEED_YIELD; } - else if (PlanStage::NEED_YIELD == status) { - ++_commonStats.needYield; + + if (_params.request->shouldReturnAnyDocs()) { + // member->obj should refer to the document we want to return. + invariant(member->state == WorkingSetMember::OWNED_OBJ); + + memberFreer.Dismiss(); // Keep this member around so we can return it. *out = id; + ++_commonStats.advanced; + return PlanStage::ADVANCED; } + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::IS_EOF == status) { + // The child is out of results, but we might not be done yet because we still might + // have to do an insert. + ++_commonStats.needTime; + return PlanStage::NEED_TIME; + } else if (PlanStage::FAILURE == status) { + *out = id; + // If a stage fails, it may create a status WSM to indicate why it failed, in which case + // 'id' is valid. If ID is invalid, we create our own error message. 
+ if (WorkingSet::INVALID_ID == id) { + const std::string errmsg = "update stage failed to read in results from child"; + *out = WorkingSetCommon::allocateStatusMember( + _ws, Status(ErrorCodes::InternalError, errmsg)); + return PlanStage::FAILURE; + } return status; + } else if (PlanStage::NEED_TIME == status) { + ++_commonStats.needTime; + } else if (PlanStage::NEED_YIELD == status) { + ++_commonStats.needYield; + *out = id; } - void UpdateStage::saveState() { - _txn = NULL; - ++_commonStats.yields; - _child->saveState(); - } - - Status UpdateStage::restoreUpdateState(OperationContext* opCtx) { - const UpdateRequest& request = *_params.request; - const NamespaceString& nsString(request.getNamespaceString()); - - // We may have stepped down during the yield. - bool userInitiatedWritesAndNotPrimary = opCtx->writesAreReplicated() && - !repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(nsString); + return status; +} - if (userInitiatedWritesAndNotPrimary) { - return Status(ErrorCodes::NotMaster, - str::stream() << "Demoted from primary while performing update on " - << nsString.ns()); - } +void UpdateStage::saveState() { + _txn = NULL; + ++_commonStats.yields; + _child->saveState(); +} - if (request.getLifecycle()) { - UpdateLifecycle* lifecycle = request.getLifecycle(); - lifecycle->setCollection(_collection); +Status UpdateStage::restoreUpdateState(OperationContext* opCtx) { + const UpdateRequest& request = *_params.request; + const NamespaceString& nsString(request.getNamespaceString()); - if (!lifecycle->canContinue()) { - return Status(ErrorCodes::IllegalOperation, - "Update aborted due to invalid state transitions after yield.", - 17270); - } + // We may have stepped down during the yield. 
+ bool userInitiatedWritesAndNotPrimary = opCtx->writesAreReplicated() && + !repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(nsString); - _params.driver->refreshIndexKeys(lifecycle->getIndexKeys(opCtx)); - } - - return Status::OK(); - } - - void UpdateStage::restoreState(OperationContext* opCtx) { - invariant(_txn == NULL); - _txn = opCtx; - ++_commonStats.unyields; - // Restore our child. - _child->restoreState(opCtx); - // Restore self. - uassertStatusOK(restoreUpdateState(opCtx)); + if (userInitiatedWritesAndNotPrimary) { + return Status(ErrorCodes::NotMaster, + str::stream() << "Demoted from primary while performing update on " + << nsString.ns()); } - void UpdateStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { - ++_commonStats.invalidates; - _child->invalidate(txn, dl, type); - } + if (request.getLifecycle()) { + UpdateLifecycle* lifecycle = request.getLifecycle(); + lifecycle->setCollection(_collection); - vector<PlanStage*> UpdateStage::getChildren() const { - vector<PlanStage*> children; - children.push_back(_child.get()); - return children; - } - - PlanStageStats* UpdateStage::getStats() { - _commonStats.isEOF = isEOF(); - unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_UPDATE)); - ret->specific.reset(new UpdateStats(_specificStats)); - ret->children.push_back(_child->getStats()); - return ret.release(); - } + if (!lifecycle->canContinue()) { + return Status(ErrorCodes::IllegalOperation, + "Update aborted due to invalid state transitions after yield.", + 17270); + } - const CommonStats* UpdateStage::getCommonStats() const { - return &_commonStats; + _params.driver->refreshIndexKeys(lifecycle->getIndexKeys(opCtx)); } - const SpecificStats* UpdateStage::getSpecificStats() const { - return &_specificStats; + return Status::OK(); +} + +void UpdateStage::restoreState(OperationContext* opCtx) { + invariant(_txn == NULL); + _txn = opCtx; + ++_commonStats.unyields; + // Restore our child. 
+ _child->restoreState(opCtx); + // Restore self. + uassertStatusOK(restoreUpdateState(opCtx)); +} + +void UpdateStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) { + ++_commonStats.invalidates; + _child->invalidate(txn, dl, type); +} + +vector<PlanStage*> UpdateStage::getChildren() const { + vector<PlanStage*> children; + children.push_back(_child.get()); + return children; +} + +PlanStageStats* UpdateStage::getStats() { + _commonStats.isEOF = isEOF(); + unique_ptr<PlanStageStats> ret(new PlanStageStats(_commonStats, STAGE_UPDATE)); + ret->specific.reset(new UpdateStats(_specificStats)); + ret->children.push_back(_child->getStats()); + return ret.release(); +} + +const CommonStats* UpdateStage::getCommonStats() const { + return &_commonStats; +} + +const SpecificStats* UpdateStage::getSpecificStats() const { + return &_specificStats; +} + +// static +UpdateResult UpdateStage::makeUpdateResult(PlanExecutor* exec, OpDebug* opDebug) { + // Get stats from the root stage. + invariant(exec->getRootStage()->isEOF()); + invariant(exec->getRootStage()->stageType() == STAGE_UPDATE); + UpdateStage* updateStage = static_cast<UpdateStage*>(exec->getRootStage()); + const UpdateStats* updateStats = + static_cast<const UpdateStats*>(updateStage->getSpecificStats()); + + // Use stats from the root stage to fill out opDebug. + opDebug->nMatched = updateStats->nMatched; + opDebug->nModified = updateStats->nModified; + opDebug->upsert = updateStats->inserted; + opDebug->fastmodinsert = updateStats->fastmodinsert; + opDebug->fastmod = updateStats->fastmod; + + // Historically, 'opDebug' considers 'nMatched' and 'nModified' to be 1 (rather than 0) + // if there is an upsert that inserts a document. The UpdateStage does not participate + // in this madness in order to have saner stats reporting for explain. This means that + // we have to set these values "manually" in the case of an insert. 
+ if (updateStats->inserted) { + opDebug->nMatched = 1; + opDebug->nModified = 1; } - // static - UpdateResult UpdateStage::makeUpdateResult(PlanExecutor* exec, OpDebug* opDebug) { - // Get stats from the root stage. - invariant(exec->getRootStage()->isEOF()); - invariant(exec->getRootStage()->stageType() == STAGE_UPDATE); - UpdateStage* updateStage = static_cast<UpdateStage*>(exec->getRootStage()); - const UpdateStats* updateStats = - static_cast<const UpdateStats*>(updateStage->getSpecificStats()); - - // Use stats from the root stage to fill out opDebug. - opDebug->nMatched = updateStats->nMatched; - opDebug->nModified = updateStats->nModified; - opDebug->upsert = updateStats->inserted; - opDebug->fastmodinsert = updateStats->fastmodinsert; - opDebug->fastmod = updateStats->fastmod; - - // Historically, 'opDebug' considers 'nMatched' and 'nModified' to be 1 (rather than 0) - // if there is an upsert that inserts a document. The UpdateStage does not participate - // in this madness in order to have saner stats reporting for explain. This means that - // we have to set these values "manually" in the case of an insert. - if (updateStats->inserted) { - opDebug->nMatched = 1; - opDebug->nModified = 1; - } - - // Get summary information about the plan. - PlanSummaryStats stats; - Explain::getSummaryStats(exec, &stats); - opDebug->nscanned = stats.totalKeysExamined; - opDebug->nscannedObjects = stats.totalDocsExamined; + // Get summary information about the plan. + PlanSummaryStats stats; + Explain::getSummaryStats(exec, &stats); + opDebug->nscanned = stats.totalKeysExamined; + opDebug->nscannedObjects = stats.totalDocsExamined; - return UpdateResult(updateStats->nMatched > 0 /* Did we update at least one obj? 
*/, - !updateStats->isDocReplacement /* $mod or obj replacement */, - opDebug->nModified /* number of modified docs, no no-ops */, - opDebug->nMatched /* # of docs matched/updated, even no-ops */, - updateStats->objInserted); - }; + return UpdateResult(updateStats->nMatched > 0 /* Did we update at least one obj? */, + !updateStats->isDocReplacement /* $mod or obj replacement */, + opDebug->nModified /* number of modified docs, no no-ops */, + opDebug->nMatched /* # of docs matched/updated, even no-ops */, + updateStats->objInserted); +}; -} // namespace mongo +} // namespace mongo diff --git a/src/mongo/db/exec/update.h b/src/mongo/db/exec/update.h index 8460846aca1..28ff014b232 100644 --- a/src/mongo/db/exec/update.h +++ b/src/mongo/db/exec/update.h @@ -38,184 +38,181 @@ namespace mongo { - class OperationContext; +class OperationContext; - struct UpdateStageParams { +struct UpdateStageParams { + UpdateStageParams(const UpdateRequest* r, UpdateDriver* d, OpDebug* o) + : request(r), driver(d), opDebug(o), canonicalQuery(NULL) {} - UpdateStageParams(const UpdateRequest* r, - UpdateDriver* d, - OpDebug* o) - : request(r), - driver(d), - opDebug(o), - canonicalQuery(NULL) { } + // Contains update parameters like whether it's a multi update or an upsert. Not owned. + // Must outlive the UpdateStage. + const UpdateRequest* request; - // Contains update parameters like whether it's a multi update or an upsert. Not owned. - // Must outlive the UpdateStage. - const UpdateRequest* request; + // Contains the logic for applying mods to documents. Not owned. Must outlive + // the UpdateStage. + UpdateDriver* driver; - // Contains the logic for applying mods to documents. Not owned. Must outlive - // the UpdateStage. - UpdateDriver* driver; + // Needed to pass to Collection::updateDocument(...). + OpDebug* opDebug; - // Needed to pass to Collection::updateDocument(...). - OpDebug* opDebug; + // Not owned here. + CanonicalQuery* canonicalQuery; - // Not owned here. 
- CanonicalQuery* canonicalQuery; +private: + // Default constructor not allowed. + UpdateStageParams(); +}; - private: - // Default constructor not allowed. - UpdateStageParams(); - }; +/** + * Execution stage responsible for updates to documents and upserts. If the prior or + * newly-updated version of the document was requested to be returned, then ADVANCED is + * returned after updating or inserting a document. Otherwise, NEED_TIME is returned after + * updating or inserting a document. + * + * Callers of work() must be holding a write lock. + */ +class UpdateStage : public PlanStage { + MONGO_DISALLOW_COPYING(UpdateStage); + +public: + UpdateStage(OperationContext* txn, + const UpdateStageParams& params, + WorkingSet* ws, + Collection* collection, + PlanStage* child); + + virtual bool isEOF(); + virtual StageState work(WorkingSetID* out); + + virtual void saveState(); + virtual void restoreState(OperationContext* opCtx); + virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); + + virtual std::vector<PlanStage*> getChildren() const; + + virtual StageType stageType() const { + return STAGE_UPDATE; + } + + virtual PlanStageStats* getStats(); + + virtual const CommonStats* getCommonStats() const; + + virtual const SpecificStats* getSpecificStats() const; + + static const char* kStageType; /** - * Execution stage responsible for updates to documents and upserts. If the prior or - * newly-updated version of the document was requested to be returned, then ADVANCED is - * returned after updating or inserting a document. Otherwise, NEED_TIME is returned after - * updating or inserting a document. + * Converts the execution stats (stored by the update stage as an UpdateStats) for the + * update plan represented by 'exec' into the UpdateResult format used to report the results + * of writes. + * + * Also responsible for filling out 'opDebug' with execution info. + * + * Should only be called once this stage is EOF. 
+ */ + static UpdateResult makeUpdateResult(PlanExecutor* exec, OpDebug* opDebug); + + /** + * Computes the document to insert if the upsert flag is set to true and no matching + * documents are found in the database. The document to upsert is computing using the + * query 'cq' and the update mods contained in 'driver'. + * + * If 'cq' is NULL, which can happen for the idhack update fast path, then 'query' is + * used to compute the doc to insert instead of 'cq'. * - * Callers of work() must be holding a write lock. + * 'doc' is the mutable BSON document which you would like the update driver to use + * when computing the document to insert. + * + * Set 'isInternalRequest' to true if the upsert was issued by the replication or + * sharding systems. + * + * Fills out whether or not this is a fastmodinsert in 'stats'. + * + * Returns the document to insert in *out. + */ + static Status applyUpdateOpsForInsert(const CanonicalQuery* cq, + const BSONObj& query, + UpdateDriver* driver, + UpdateLifecycle* lifecycle, + mutablebson::Document* doc, + bool isInternalRequest, + UpdateStats* stats, + BSONObj* out); + +private: + /** + * Computes the result of applying mods to the document 'oldObj' at RecordId 'loc' in + * memory, then commits these changes to the database. Returns a possibly unowned copy + * of the newly-updated version of the document. + */ + BSONObj transformAndUpdate(const Snapshotted<BSONObj>& oldObj, RecordId& loc); + + /** + * Computes the document to insert and inserts it into the collection. Used if the + * user requested an upsert and no matching documents were found. + */ + void doInsert(); + + /** + * Have we performed all necessary updates? Even if this is true, we might not be EOF, + * as we might still have to do an insert. + */ + bool doneUpdating(); + + /** + * Examines the stats / update request and returns whether there is still an insert left + * to do. If so then this stage is not EOF yet. 
+ */ + bool needInsert(); + + /** + * Helper for restoring the state of this update. */ - class UpdateStage : public PlanStage { - MONGO_DISALLOW_COPYING(UpdateStage); - public: - UpdateStage(OperationContext* txn, - const UpdateStageParams& params, - WorkingSet* ws, - Collection* collection, - PlanStage* child); - - virtual bool isEOF(); - virtual StageState work(WorkingSetID* out); - - virtual void saveState(); - virtual void restoreState(OperationContext* opCtx); - virtual void invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type); - - virtual std::vector<PlanStage*> getChildren() const; - - virtual StageType stageType() const { return STAGE_UPDATE; } - - virtual PlanStageStats* getStats(); - - virtual const CommonStats* getCommonStats() const; - - virtual const SpecificStats* getSpecificStats() const; - - static const char* kStageType; - - /** - * Converts the execution stats (stored by the update stage as an UpdateStats) for the - * update plan represented by 'exec' into the UpdateResult format used to report the results - * of writes. - * - * Also responsible for filling out 'opDebug' with execution info. - * - * Should only be called once this stage is EOF. - */ - static UpdateResult makeUpdateResult(PlanExecutor* exec, OpDebug* opDebug); - - /** - * Computes the document to insert if the upsert flag is set to true and no matching - * documents are found in the database. The document to upsert is computing using the - * query 'cq' and the update mods contained in 'driver'. - * - * If 'cq' is NULL, which can happen for the idhack update fast path, then 'query' is - * used to compute the doc to insert instead of 'cq'. - * - * 'doc' is the mutable BSON document which you would like the update driver to use - * when computing the document to insert. - * - * Set 'isInternalRequest' to true if the upsert was issued by the replication or - * sharding systems. - * - * Fills out whether or not this is a fastmodinsert in 'stats'. 
- * - * Returns the document to insert in *out. - */ - static Status applyUpdateOpsForInsert(const CanonicalQuery* cq, - const BSONObj& query, - UpdateDriver* driver, - UpdateLifecycle* lifecycle, - mutablebson::Document* doc, - bool isInternalRequest, - UpdateStats* stats, - BSONObj* out); - - private: - /** - * Computes the result of applying mods to the document 'oldObj' at RecordId 'loc' in - * memory, then commits these changes to the database. Returns a possibly unowned copy - * of the newly-updated version of the document. - */ - BSONObj transformAndUpdate(const Snapshotted<BSONObj>& oldObj, RecordId& loc); - - /** - * Computes the document to insert and inserts it into the collection. Used if the - * user requested an upsert and no matching documents were found. - */ - void doInsert(); - - /** - * Have we performed all necessary updates? Even if this is true, we might not be EOF, - * as we might still have to do an insert. - */ - bool doneUpdating(); - - /** - * Examines the stats / update request and returns whether there is still an insert left - * to do. If so then this stage is not EOF yet. - */ - bool needInsert(); - - /** - * Helper for restoring the state of this update. - */ - Status restoreUpdateState(OperationContext* opCtx); - - // Transactional context. Not owned by us. - OperationContext* _txn; - - UpdateStageParams _params; - - // Not owned by us. - WorkingSet* _ws; - - // Not owned by us. May be NULL. - Collection* _collection; - - // Owned by us. - std::unique_ptr<PlanStage> _child; - - // If not WorkingSet::INVALID_ID, we use this rather than asking our child what to do next. - WorkingSetID _idRetrying; - - // If not WorkingSet::INVALID_ID, we return this member to our caller. - WorkingSetID _idReturning; - - // Stats - CommonStats _commonStats; - UpdateStats _specificStats; - - // If the update was in-place, we may see it again. 
This only matters if we're doing - // a multi-update; if we're not doing a multi-update we stop after one update and we - // won't see any more docs. - // - // For example: If we're scanning an index {x:1} and performing {$inc:{x:5}}, we'll keep - // moving the document forward and it will continue to reappear in our index scan. - // Unless the index is multikey, the underlying query machinery won't de-dup. - // - // If the update wasn't in-place we may see it again. Our query may return the new - // document and we wouldn't want to update that. - // - // So, no matter what, we keep track of where the doc wound up. - typedef unordered_set<RecordId, RecordId::Hasher> DiskLocSet; - const std::unique_ptr<DiskLocSet> _updatedLocs; - - // These get reused for each update. - mutablebson::Document& _doc; - mutablebson::DamageVector _damages; - }; + Status restoreUpdateState(OperationContext* opCtx); + + // Transactional context. Not owned by us. + OperationContext* _txn; + + UpdateStageParams _params; + + // Not owned by us. + WorkingSet* _ws; + + // Not owned by us. May be NULL. + Collection* _collection; + + // Owned by us. + std::unique_ptr<PlanStage> _child; + + // If not WorkingSet::INVALID_ID, we use this rather than asking our child what to do next. + WorkingSetID _idRetrying; + + // If not WorkingSet::INVALID_ID, we return this member to our caller. + WorkingSetID _idReturning; + + // Stats + CommonStats _commonStats; + UpdateStats _specificStats; + + // If the update was in-place, we may see it again. This only matters if we're doing + // a multi-update; if we're not doing a multi-update we stop after one update and we + // won't see any more docs. + // + // For example: If we're scanning an index {x:1} and performing {$inc:{x:5}}, we'll keep + // moving the document forward and it will continue to reappear in our index scan. + // Unless the index is multikey, the underlying query machinery won't de-dup. 
+ // + // If the update wasn't in-place we may see it again. Our query may return the new + // document and we wouldn't want to update that. + // + // So, no matter what, we keep track of where the doc wound up. + typedef unordered_set<RecordId, RecordId::Hasher> DiskLocSet; + const std::unique_ptr<DiskLocSet> _updatedLocs; + + // These get reused for each update. + mutablebson::Document& _doc; + mutablebson::DamageVector _damages; +}; } // namespace mongo diff --git a/src/mongo/db/exec/working_set.cpp b/src/mongo/db/exec/working_set.cpp index 746ace2f3ca..5e531af346a 100644 --- a/src/mongo/db/exec/working_set.cpp +++ b/src/mongo/db/exec/working_set.cpp @@ -33,248 +33,247 @@ namespace mongo { - using std::string; +using std::string; - WorkingSet::MemberHolder::MemberHolder() : member(NULL) { } - WorkingSet::MemberHolder::~MemberHolder() {} +WorkingSet::MemberHolder::MemberHolder() : member(NULL) {} +WorkingSet::MemberHolder::~MemberHolder() {} - WorkingSet::WorkingSet() : _freeList(INVALID_ID) { } +WorkingSet::WorkingSet() : _freeList(INVALID_ID) {} - WorkingSet::~WorkingSet() { - for (size_t i = 0; i < _data.size(); i++) { - delete _data[i].member; - } +WorkingSet::~WorkingSet() { + for (size_t i = 0; i < _data.size(); i++) { + delete _data[i].member; } - - WorkingSetID WorkingSet::allocate() { - if (_freeList == INVALID_ID) { - // The free list is empty so we need to make a single new WSM to return. This relies on - // vector::resize being amortized O(1) for efficient allocation. Note that the free list - // remains empty until something is returned by a call to free(). - WorkingSetID id = _data.size(); - _data.resize(_data.size() + 1); - _data.back().nextFreeOrSelf = id; - _data.back().member = new WorkingSetMember(); - return id; - } - - // Pop the head off the free list and return it. 
- WorkingSetID id = _freeList; - _freeList = _data[id].nextFreeOrSelf; - _data[id].nextFreeOrSelf = id; // set to self to mark as in-use +} + +WorkingSetID WorkingSet::allocate() { + if (_freeList == INVALID_ID) { + // The free list is empty so we need to make a single new WSM to return. This relies on + // vector::resize being amortized O(1) for efficient allocation. Note that the free list + // remains empty until something is returned by a call to free(). + WorkingSetID id = _data.size(); + _data.resize(_data.size() + 1); + _data.back().nextFreeOrSelf = id; + _data.back().member = new WorkingSetMember(); return id; } - void WorkingSet::free(const WorkingSetID& i) { - MemberHolder& holder = _data[i]; - verify(i < _data.size()); // ID has been allocated. - verify(holder.nextFreeOrSelf == i); // ID currently in use. - - // Free resources and push this WSM to the head of the freelist. - holder.member->clear(); - holder.nextFreeOrSelf = _freeList; - _freeList = i; - } - - void WorkingSet::flagForReview(const WorkingSetID& i) { - WorkingSetMember* member = get(i); - verify(WorkingSetMember::OWNED_OBJ == member->state); - _flagged.insert(i); + // Pop the head off the free list and return it. + WorkingSetID id = _freeList; + _freeList = _data[id].nextFreeOrSelf; + _data[id].nextFreeOrSelf = id; // set to self to mark as in-use + return id; +} + +void WorkingSet::free(const WorkingSetID& i) { + MemberHolder& holder = _data[i]; + verify(i < _data.size()); // ID has been allocated. + verify(holder.nextFreeOrSelf == i); // ID currently in use. + + // Free resources and push this WSM to the head of the freelist. 
+ holder.member->clear(); + holder.nextFreeOrSelf = _freeList; + _freeList = i; +} + +void WorkingSet::flagForReview(const WorkingSetID& i) { + WorkingSetMember* member = get(i); + verify(WorkingSetMember::OWNED_OBJ == member->state); + _flagged.insert(i); +} + +const unordered_set<WorkingSetID>& WorkingSet::getFlagged() const { + return _flagged; +} + +bool WorkingSet::isFlagged(WorkingSetID id) const { + invariant(id < _data.size()); + return _flagged.end() != _flagged.find(id); +} + +void WorkingSet::clear() { + for (size_t i = 0; i < _data.size(); i++) { + delete _data[i].member; } + _data.clear(); - const unordered_set<WorkingSetID>& WorkingSet::getFlagged() const { - return _flagged; - } - - bool WorkingSet::isFlagged(WorkingSetID id) const { - invariant(id < _data.size()); - return _flagged.end() != _flagged.find(id); - } + // Since working set is now empty, the free list pointer should + // point to nothing. + _freeList = INVALID_ID; - void WorkingSet::clear() { - for (size_t i = 0; i < _data.size(); i++) { - delete _data[i].member; - } - _data.clear(); + _flagged.clear(); +} - // Since working set is now empty, the free list pointer should - // point to nothing. - _freeList = INVALID_ID; +// +// Iteration +// - _flagged.clear(); +WorkingSet::iterator::iterator(WorkingSet* ws, size_t index) : _ws(ws), _index(index) { + // If we're currently not pointing at an allocated member, then we have + // to advance to the first one, unless we're already at the end. + if (_index < _ws->_data.size() && isFree()) { + advance(); } +} - // - // Iteration - // - - WorkingSet::iterator::iterator(WorkingSet* ws, size_t index) - : _ws(ws), - _index(index) { - // If we're currently not pointing at an allocated member, then we have - // to advance to the first one, unless we're already at the end. - if (_index < _ws->_data.size() && isFree()) { - advance(); - } - } +void WorkingSet::iterator::advance() { + // Move forward at least once in the data list. 
+ _index++; - void WorkingSet::iterator::advance() { - // Move forward at least once in the data list. + // While we haven't hit the end and the current member is not in use. (Skips ahead until + // we find the next allocated member.) + while (_index < _ws->_data.size() && isFree()) { _index++; - - // While we haven't hit the end and the current member is not in use. (Skips ahead until - // we find the next allocated member.) - while (_index < _ws->_data.size() && isFree()) { - _index++; - } } +} - bool WorkingSet::iterator::isFree() const { - return _ws->_data[_index].nextFreeOrSelf != _index; - } +bool WorkingSet::iterator::isFree() const { + return _ws->_data[_index].nextFreeOrSelf != _index; +} - void WorkingSet::iterator::free() { - dassert(!isFree()); - _ws->free(_index); - } +void WorkingSet::iterator::free() { + dassert(!isFree()); + _ws->free(_index); +} - void WorkingSet::iterator::operator++() { - dassert(_index < _ws->_data.size()); - advance(); - } +void WorkingSet::iterator::operator++() { + dassert(_index < _ws->_data.size()); + advance(); +} - bool WorkingSet::iterator::operator==(const WorkingSet::iterator& other) const { - return (_index == other._index); - } +bool WorkingSet::iterator::operator==(const WorkingSet::iterator& other) const { + return (_index == other._index); +} - bool WorkingSet::iterator::operator!=(const WorkingSet::iterator& other) const { - return (_index != other._index); - } +bool WorkingSet::iterator::operator!=(const WorkingSet::iterator& other) const { + return (_index != other._index); +} - WorkingSetMember& WorkingSet::iterator::operator*() { - dassert(_index < _ws->_data.size() && !isFree()); - return *_ws->_data[_index].member; - } +WorkingSetMember& WorkingSet::iterator::operator*() { + dassert(_index < _ws->_data.size() && !isFree()); + return *_ws->_data[_index].member; +} - WorkingSetMember* WorkingSet::iterator::operator->() { - dassert(_index < _ws->_data.size() && !isFree()); - return _ws->_data[_index].member; 
- } +WorkingSetMember* WorkingSet::iterator::operator->() { + dassert(_index < _ws->_data.size() && !isFree()); + return _ws->_data[_index].member; +} - WorkingSet::iterator WorkingSet::begin() { - return WorkingSet::iterator(this, 0); - } +WorkingSet::iterator WorkingSet::begin() { + return WorkingSet::iterator(this, 0); +} - WorkingSet::iterator WorkingSet::end() { - return WorkingSet::iterator(this, _data.size()); - } +WorkingSet::iterator WorkingSet::end() { + return WorkingSet::iterator(this, _data.size()); +} - // - // WorkingSetMember - // +// +// WorkingSetMember +// - WorkingSetMember::WorkingSetMember() : state(WorkingSetMember::INVALID), isSuspicious(false) { } +WorkingSetMember::WorkingSetMember() : state(WorkingSetMember::INVALID), isSuspicious(false) {} - WorkingSetMember::~WorkingSetMember() { } +WorkingSetMember::~WorkingSetMember() {} - void WorkingSetMember::clear() { - for (size_t i = 0; i < WSM_COMPUTED_NUM_TYPES; i++) { - _computed[i].reset(); - } - - keyData.clear(); - obj.reset(); - state = WorkingSetMember::INVALID; - } - - bool WorkingSetMember::hasLoc() const { - return state == LOC_AND_IDX || state == LOC_AND_UNOWNED_OBJ || state == LOC_AND_OWNED_OBJ; +void WorkingSetMember::clear() { + for (size_t i = 0; i < WSM_COMPUTED_NUM_TYPES; i++) { + _computed[i].reset(); } - bool WorkingSetMember::hasObj() const { - return hasOwnedObj() || hasUnownedObj(); + keyData.clear(); + obj.reset(); + state = WorkingSetMember::INVALID; +} + +bool WorkingSetMember::hasLoc() const { + return state == LOC_AND_IDX || state == LOC_AND_UNOWNED_OBJ || state == LOC_AND_OWNED_OBJ; +} + +bool WorkingSetMember::hasObj() const { + return hasOwnedObj() || hasUnownedObj(); +} + +bool WorkingSetMember::hasOwnedObj() const { + return state == OWNED_OBJ || state == LOC_AND_OWNED_OBJ; +} + +bool WorkingSetMember::hasUnownedObj() const { + return state == LOC_AND_UNOWNED_OBJ; +} + +bool WorkingSetMember::hasComputed(const WorkingSetComputedDataType type) const { + return 
_computed[type].get(); +} + +const WorkingSetComputedData* WorkingSetMember::getComputed( + const WorkingSetComputedDataType type) const { + verify(_computed[type]); + return _computed[type].get(); +} + +void WorkingSetMember::addComputed(WorkingSetComputedData* data) { + verify(!hasComputed(data->type())); + _computed[data->type()].reset(data); +} + +void WorkingSetMember::setFetcher(RecordFetcher* fetcher) { + _fetcher.reset(fetcher); +} + +RecordFetcher* WorkingSetMember::releaseFetcher() { + return _fetcher.release(); +} + +bool WorkingSetMember::hasFetcher() const { + return NULL != _fetcher.get(); +} + +bool WorkingSetMember::getFieldDotted(const string& field, BSONElement* out) const { + // If our state is such that we have an object, use it. + if (hasObj()) { + *out = obj.value().getFieldDotted(field); + return true; } - bool WorkingSetMember::hasOwnedObj() const { - return state == OWNED_OBJ || state == LOC_AND_OWNED_OBJ; - } + // Our state should be such that we have index data/are covered. 
+ for (size_t i = 0; i < keyData.size(); ++i) { + BSONObjIterator keyPatternIt(keyData[i].indexKeyPattern); + BSONObjIterator keyDataIt(keyData[i].keyData); - bool WorkingSetMember::hasUnownedObj() const { - return state == LOC_AND_UNOWNED_OBJ; - } - - bool WorkingSetMember::hasComputed(const WorkingSetComputedDataType type) const { - return _computed[type].get(); - } + while (keyPatternIt.more()) { + BSONElement keyPatternElt = keyPatternIt.next(); + verify(keyDataIt.more()); + BSONElement keyDataElt = keyDataIt.next(); - const WorkingSetComputedData* WorkingSetMember::getComputed(const WorkingSetComputedDataType type) const { - verify(_computed[type]); - return _computed[type].get(); + if (field == keyPatternElt.fieldName()) { + *out = keyDataElt; + return true; + } + } } - void WorkingSetMember::addComputed(WorkingSetComputedData* data) { - verify(!hasComputed(data->type())); - _computed[data->type()].reset(data); - } + return false; +} - void WorkingSetMember::setFetcher(RecordFetcher* fetcher) { - _fetcher.reset(fetcher); - } +size_t WorkingSetMember::getMemUsage() const { + size_t memUsage = 0; - RecordFetcher* WorkingSetMember::releaseFetcher() { - return _fetcher.release(); + if (hasLoc()) { + memUsage += sizeof(RecordId); } - bool WorkingSetMember::hasFetcher() const { - return NULL != _fetcher.get(); + // XXX: Unowned objects count towards current size. + // See SERVER-12579 + if (hasObj()) { + memUsage += obj.value().objsize(); } - bool WorkingSetMember::getFieldDotted(const string& field, BSONElement* out) const { - // If our state is such that we have an object, use it. - if (hasObj()) { - *out = obj.value().getFieldDotted(field); - return true; - } - - // Our state should be such that we have index data/are covered. 
- for (size_t i = 0; i < keyData.size(); ++i) { - BSONObjIterator keyPatternIt(keyData[i].indexKeyPattern); - BSONObjIterator keyDataIt(keyData[i].keyData); - - while (keyPatternIt.more()) { - BSONElement keyPatternElt = keyPatternIt.next(); - verify(keyDataIt.more()); - BSONElement keyDataElt = keyDataIt.next(); - - if (field == keyPatternElt.fieldName()) { - *out = keyDataElt; - return true; - } - } - } - - return false; + for (size_t i = 0; i < keyData.size(); ++i) { + const IndexKeyDatum& keyDatum = keyData[i]; + memUsage += keyDatum.keyData.objsize(); } - size_t WorkingSetMember::getMemUsage() const { - size_t memUsage = 0; - - if (hasLoc()) { - memUsage += sizeof(RecordId); - } - - // XXX: Unowned objects count towards current size. - // See SERVER-12579 - if (hasObj()) { - memUsage += obj.value().objsize(); - } - - for (size_t i = 0; i < keyData.size(); ++i) { - const IndexKeyDatum& keyDatum = keyData[i]; - memUsage += keyDatum.keyData.objsize(); - } - - return memUsage; - } + return memUsage; +} } // namespace mongo diff --git a/src/mongo/db/exec/working_set.h b/src/mongo/db/exec/working_set.h index ac23c2cf667..74080e6f4cc 100644 --- a/src/mongo/db/exec/working_set.h +++ b/src/mongo/db/exec/working_set.h @@ -38,308 +38,311 @@ namespace mongo { - class IndexAccessMethod; - class RecordFetcher; - class WorkingSetMember; +class IndexAccessMethod; +class RecordFetcher; +class WorkingSetMember; - typedef size_t WorkingSetID; +typedef size_t WorkingSetID; + +/** + * All data in use by a query. Data is passed through the stage tree by referencing the ID of + * an element of the working set. Stages can add elements to the working set, delete elements + * from the working set, or mutate elements in the working set. + * + * Concurrency Notes: + * flagForReview() can only be called with a write lock covering the collection this WorkingSet + * is for. 
All other methods should only be called by the thread owning this WorkingSet while + * holding the read lock covering the collection. + */ +class WorkingSet { + MONGO_DISALLOW_COPYING(WorkingSet); + +public: + static const WorkingSetID INVALID_ID = WorkingSetID(-1); + + WorkingSet(); + ~WorkingSet(); + + /** + * Allocate a new query result and return the ID used to get and free it. + */ + WorkingSetID allocate(); + + /** + * Get the i-th mutable query result. The pointer will be valid for this id until freed. + * Do not delete the returned pointer as the WorkingSet retains ownership. Call free() to + * release it. + */ + WorkingSetMember* get(const WorkingSetID& i) const { + dassert(i < _data.size()); // ID has been allocated. + dassert(_data[i].nextFreeOrSelf == i); // ID currently in use. + return _data[i].member; + } + + /** + * Deallocate the i-th query result and release its resources. + */ + void free(const WorkingSetID& i); /** - * All data in use by a query. Data is passed through the stage tree by referencing the ID of - * an element of the working set. Stages can add elements to the working set, delete elements - * from the working set, or mutate elements in the working set. + * The RecordId in WSM 'i' was invalidated while being processed. Any predicates over the + * WSM could not be fully evaluated, so the WSM may or may not satisfy them. As such, if we + * wish to output the WSM, we must do some clean-up work later. Adds the WSM with id 'i' to + * the list of flagged WSIDs. * - * Concurrency Notes: - * flagForReview() can only be called with a write lock covering the collection this WorkingSet - * is for. All other methods should only be called by the thread owning this WorkingSet while - * holding the read lock covering the collection. + * The WSM must be in the state OWNED_OBJ. 
*/ - class WorkingSet { - MONGO_DISALLOW_COPYING(WorkingSet); - public: - static const WorkingSetID INVALID_ID = WorkingSetID(-1); + void flagForReview(const WorkingSetID& i); - WorkingSet(); - ~WorkingSet(); + /** + * Return true if the provided ID is flagged. + */ + bool isFlagged(WorkingSetID id) const; - /** - * Allocate a new query result and return the ID used to get and free it. - */ - WorkingSetID allocate(); + /** + * Return the set of all WSIDs passed to flagForReview. + */ + const unordered_set<WorkingSetID>& getFlagged() const; - /** - * Get the i-th mutable query result. The pointer will be valid for this id until freed. - * Do not delete the returned pointer as the WorkingSet retains ownership. Call free() to - * release it. - */ - WorkingSetMember* get(const WorkingSetID& i) const { - dassert(i < _data.size()); // ID has been allocated. - dassert(_data[i].nextFreeOrSelf == i); // ID currently in use. - return _data[i].member; - } + /** + * Removes and deallocates all members of this working set. + */ + void clear(); - /** - * Deallocate the i-th query result and release its resources. - */ - void free(const WorkingSetID& i); + // + // Iteration + // - /** - * The RecordId in WSM 'i' was invalidated while being processed. Any predicates over the - * WSM could not be fully evaluated, so the WSM may or may not satisfy them. As such, if we - * wish to output the WSM, we must do some clean-up work later. Adds the WSM with id 'i' to - * the list of flagged WSIDs. - * - * The WSM must be in the state OWNED_OBJ. - */ - void flagForReview(const WorkingSetID& i); + /** + * Forward iterates over the list of working set members, skipping any entries + * that are on the free list. + */ + class iterator { + public: + iterator(WorkingSet* ws, size_t index); - /** - * Return true if the provided ID is flagged. 
- */ - bool isFlagged(WorkingSetID id) const; + void operator++(); + + bool operator==(const WorkingSet::iterator& other) const; + bool operator!=(const WorkingSet::iterator& other) const; + + WorkingSetMember& operator*(); + + WorkingSetMember* operator->(); /** - * Return the set of all WSIDs passed to flagForReview. + * Free the WSM we are currently pointing to. Does not advance the iterator. + * + * It is invalid to dereference the iterator after calling free until the iterator is + * next incremented. */ - const unordered_set<WorkingSetID>& getFlagged() const; + void free(); + private: /** - * Removes and deallocates all members of this working set. + * Move the iterator forward to the next allocated WSM. */ - void clear(); - - // - // Iteration - // + void advance(); /** - * Forward iterates over the list of working set members, skipping any entries - * that are on the free list. + * Returns true if the MemberHolder currently pointed at by the iterator is free, and + * false if it contains an allocated working set member. */ - class iterator { - public: - iterator(WorkingSet* ws, size_t index); - - void operator++(); - - bool operator==(const WorkingSet::iterator& other) const; - bool operator!=(const WorkingSet::iterator& other) const; + bool isFree() const; - WorkingSetMember& operator*(); + // The working set we're iterating over. Not owned here. + WorkingSet* _ws; - WorkingSetMember* operator->(); - - /** - * Free the WSM we are currently pointing to. Does not advance the iterator. - * - * It is invalid to dereference the iterator after calling free until the iterator is - * next incremented. - */ - void free(); - - private: - /** - * Move the iterator forward to the next allocated WSM. - */ - void advance(); + // The index of the member we're currently pointing at. + size_t _index; + }; - /** - * Returns true if the MemberHolder currently pointed at by the iterator is free, and - * false if it contains an allocated working set member. 
- */ - bool isFree() const; + WorkingSet::iterator begin(); - // The working set we're iterating over. Not owned here. - WorkingSet* _ws; + WorkingSet::iterator end(); - // The index of the member we're currently pointing at. - size_t _index; - }; +private: + struct MemberHolder { + MemberHolder(); + ~MemberHolder(); - WorkingSet::iterator begin(); + // Free list link if freed. Points to self if in use. + WorkingSetID nextFreeOrSelf; - WorkingSet::iterator end(); + // Owning pointer + WorkingSetMember* member; + }; - private: - struct MemberHolder { - MemberHolder(); - ~MemberHolder(); + // All WorkingSetIDs are indexes into this, except for INVALID_ID. + // Elements are added to _freeList rather than removed when freed. + std::vector<MemberHolder> _data; - // Free list link if freed. Points to self if in use. - WorkingSetID nextFreeOrSelf; + // Index into _data, forming a linked-list using MemberHolder::nextFreeOrSelf as the next + // link. INVALID_ID is the list terminator since 0 is a valid index. + // If _freeList == INVALID_ID, the free list is empty and all elements in _data are in use. + WorkingSetID _freeList; - // Owning pointer - WorkingSetMember* member; - }; + // An insert-only set of WorkingSetIDs that have been flagged for review. + unordered_set<WorkingSetID> _flagged; +}; - // All WorkingSetIDs are indexes into this, except for INVALID_ID. - // Elements are added to _freeList rather than removed when freed. - std::vector<MemberHolder> _data; +/** + * The key data extracted from an index. Keeps track of both the key (currently a BSONObj) and + * the index that provided the key. The index key pattern is required to correctly interpret + * the key. + */ +struct IndexKeyDatum { + IndexKeyDatum(const BSONObj& keyPattern, const BSONObj& key, const IndexAccessMethod* index) + : indexKeyPattern(keyPattern), keyData(key), index(index) {} - // Index into _data, forming a linked-list using MemberHolder::nextFreeOrSelf as the next - // link. 
INVALID_ID is the list terminator since 0 is a valid index. - // If _freeList == INVALID_ID, the free list is empty and all elements in _data are in use. - WorkingSetID _freeList; + // This is not owned and points into the IndexDescriptor's data. + BSONObj indexKeyPattern; - // An insert-only set of WorkingSetIDs that have been flagged for review. - unordered_set<WorkingSetID> _flagged; - }; + // This is the BSONObj for the key that we put into the index. Owned by us. + BSONObj keyData; - /** - * The key data extracted from an index. Keeps track of both the key (currently a BSONObj) and - * the index that provided the key. The index key pattern is required to correctly interpret - * the key. - */ - struct IndexKeyDatum { - IndexKeyDatum(const BSONObj& keyPattern, const BSONObj& key, const IndexAccessMethod* index) - : indexKeyPattern(keyPattern), - keyData(key), - index(index) { } + const IndexAccessMethod* index; +}; - // This is not owned and points into the IndexDescriptor's data. - BSONObj indexKeyPattern; +/** + * What types of computed data can we have? + */ +enum WorkingSetComputedDataType { + // What's the score of the document retrieved from a $text query? + WSM_COMPUTED_TEXT_SCORE = 0, - // This is the BSONObj for the key that we put into the index. Owned by us. - BSONObj keyData; + // What's the distance from a geoNear query point to the document? + WSM_COMPUTED_GEO_DISTANCE = 1, - const IndexAccessMethod* index; - }; + // The index key used to retrieve the document, for $returnKey query option. + WSM_INDEX_KEY = 2, - /** - * What types of computed data can we have? - */ - enum WorkingSetComputedDataType { - // What's the score of the document retrieved from a $text query? - WSM_COMPUTED_TEXT_SCORE = 0, + // What point (of several possible points) was used to compute the distance to the document + // via geoNear? + WSM_GEO_NEAR_POINT = 3, - // What's the distance from a geoNear query point to the document? 
- WSM_COMPUTED_GEO_DISTANCE = 1, + // Must be last. + WSM_COMPUTED_NUM_TYPES, +}; - // The index key used to retrieve the document, for $returnKey query option. - WSM_INDEX_KEY = 2, +/** + * Data that is a computed function of a WSM. + */ +class WorkingSetComputedData { + MONGO_DISALLOW_COPYING(WorkingSetComputedData); - // What point (of several possible points) was used to compute the distance to the document - // via geoNear? - WSM_GEO_NEAR_POINT = 3, +public: + WorkingSetComputedData(const WorkingSetComputedDataType type) : _type(type) {} + virtual ~WorkingSetComputedData() {} - // Must be last. - WSM_COMPUTED_NUM_TYPES, - }; + WorkingSetComputedDataType type() const { + return _type; + } - /** - * Data that is a computed function of a WSM. - */ - class WorkingSetComputedData { - MONGO_DISALLOW_COPYING(WorkingSetComputedData); - public: - WorkingSetComputedData(const WorkingSetComputedDataType type) : _type(type) { } - virtual ~WorkingSetComputedData() { } + virtual WorkingSetComputedData* clone() const = 0; - WorkingSetComputedDataType type() const { return _type; } +private: + WorkingSetComputedDataType _type; +}; - virtual WorkingSetComputedData* clone() const = 0; +/** + * The type of the data passed between query stages. In particular: + * + * Index scan stages return a WorkingSetMember in the LOC_AND_IDX state. + * + * Collection scan stages the LOC_AND_UNOWNED_OBJ state. + * + * A WorkingSetMember may have any of the data above. + */ +class WorkingSetMember { + MONGO_DISALLOW_COPYING(WorkingSetMember); - private: - WorkingSetComputedDataType _type; - }; +public: + WorkingSetMember(); + ~WorkingSetMember(); /** - * The type of the data passed between query stages. In particular: - * - * Index scan stages return a WorkingSetMember in the LOC_AND_IDX state. - * - * Collection scan stages the LOC_AND_UNOWNED_OBJ state. - * - * A WorkingSetMember may have any of the data above. + * Reset to an "empty" state. 
*/ - class WorkingSetMember { - MONGO_DISALLOW_COPYING(WorkingSetMember); - public: - WorkingSetMember(); - ~WorkingSetMember(); + void clear(); - /** - * Reset to an "empty" state. - */ - void clear(); - - enum MemberState { - // Initial state. - INVALID, - - // Data is from 1 or more indices. - LOC_AND_IDX, + enum MemberState { + // Initial state. + INVALID, - // Data is from a collection scan, or data is from an index scan and was fetched. - LOC_AND_UNOWNED_OBJ, + // Data is from 1 or more indices. + LOC_AND_IDX, - // RecordId has been invalidated, or the obj doesn't correspond to an on-disk document - // anymore (e.g. is a computed expression). - OWNED_OBJ, + // Data is from a collection scan, or data is from an index scan and was fetched. + LOC_AND_UNOWNED_OBJ, - // Due to a yield, RecordId is no longer protected by the storage engine's transaction - // and may have been invalidated. The object is either identical to the object keyed - // by RecordId, or is an old version of the document stored at RecordId. - // - // Only used by doc-level locking storage engines (not used by MMAP v1). - LOC_AND_OWNED_OBJ, - }; + // RecordId has been invalidated, or the obj doesn't correspond to an on-disk document + // anymore (e.g. is a computed expression). + OWNED_OBJ, + // Due to a yield, RecordId is no longer protected by the storage engine's transaction + // and may have been invalidated. The object is either identical to the object keyed + // by RecordId, or is an old version of the document stored at RecordId. // - // Core attributes - // + // Only used by doc-level locking storage engines (not used by MMAP v1). + LOC_AND_OWNED_OBJ, + }; - RecordId loc; - Snapshotted<BSONObj> obj; - std::vector<IndexKeyDatum> keyData; - MemberState state; + // + // Core attributes + // - // True if this WSM has survived a yield in LOC_AND_IDX state. - // TODO consider replacing by tracking SnapshotIds for IndexKeyDatums. 
- bool isSuspicious; + RecordId loc; + Snapshotted<BSONObj> obj; + std::vector<IndexKeyDatum> keyData; + MemberState state; - bool hasLoc() const; - bool hasObj() const; - bool hasOwnedObj() const; - bool hasUnownedObj() const; + // True if this WSM has survived a yield in LOC_AND_IDX state. + // TODO consider replacing by tracking SnapshotIds for IndexKeyDatums. + bool isSuspicious; - // - // Computed data - // + bool hasLoc() const; + bool hasObj() const; + bool hasOwnedObj() const; + bool hasUnownedObj() const; - bool hasComputed(const WorkingSetComputedDataType type) const; - const WorkingSetComputedData* getComputed(const WorkingSetComputedDataType type) const; - void addComputed(WorkingSetComputedData* data); + // + // Computed data + // - // - // Fetching - // + bool hasComputed(const WorkingSetComputedDataType type) const; + const WorkingSetComputedData* getComputed(const WorkingSetComputedDataType type) const; + void addComputed(WorkingSetComputedData* data); - void setFetcher(RecordFetcher* fetcher); - // Transfers ownership to the caller. - RecordFetcher* releaseFetcher(); - bool hasFetcher() const; + // + // Fetching + // - /** - * getFieldDotted uses its state (obj or index data) to produce the field with the provided - * name. - * - * Returns true if there is the element is in an index key or in an (owned or unowned) - * object. *out is set to the element if so. - * - * Returns false otherwise. Returning false indicates a query planning error. - */ - bool getFieldDotted(const std::string& field, BSONElement* out) const; + void setFetcher(RecordFetcher* fetcher); + // Transfers ownership to the caller. + RecordFetcher* releaseFetcher(); + bool hasFetcher() const; - /** - * Returns expected memory usage of working set member. - */ - size_t getMemUsage() const; + /** + * getFieldDotted uses its state (obj or index data) to produce the field with the provided + * name. 
+ * + * Returns true if there is the element is in an index key or in an (owned or unowned) + * object. *out is set to the element if so. + * + * Returns false otherwise. Returning false indicates a query planning error. + */ + bool getFieldDotted(const std::string& field, BSONElement* out) const; - private: - std::unique_ptr<WorkingSetComputedData> _computed[WSM_COMPUTED_NUM_TYPES]; + /** + * Returns expected memory usage of working set member. + */ + size_t getMemUsage() const; - std::unique_ptr<RecordFetcher> _fetcher; - }; +private: + std::unique_ptr<WorkingSetComputedData> _computed[WSM_COMPUTED_NUM_TYPES]; + + std::unique_ptr<RecordFetcher> _fetcher; +}; } // namespace mongo diff --git a/src/mongo/db/exec/working_set_common.cpp b/src/mongo/db/exec/working_set_common.cpp index 6f03d1378a1..7fb10dd1792 100644 --- a/src/mongo/db/exec/working_set_common.cpp +++ b/src/mongo/db/exec/working_set_common.cpp @@ -39,170 +39,171 @@ namespace mongo { - // static - bool WorkingSetCommon::fetchAndInvalidateLoc(OperationContext* txn, - WorkingSetMember* member, - const Collection* collection) { - // Already in our desired state. - if (member->state == WorkingSetMember::OWNED_OBJ) { return true; } - - // We can't do anything without a RecordId. - if (!member->hasLoc()) { return false; } - - // Do the fetch, invalidate the DL. - member->obj = collection->docFor(txn, member->loc); - member->obj.setValue(member->obj.value().getOwned() ); - - member->state = WorkingSetMember::OWNED_OBJ; - member->loc = RecordId(); +// static +bool WorkingSetCommon::fetchAndInvalidateLoc(OperationContext* txn, + WorkingSetMember* member, + const Collection* collection) { + // Already in our desired state. 
+ if (member->state == WorkingSetMember::OWNED_OBJ) { return true; } - void WorkingSetCommon::prepareForSnapshotChange(WorkingSet* workingSet) { - dassert(supportsDocLocking()); - - for (WorkingSet::iterator it = workingSet->begin(); it != workingSet->end(); ++it) { - if (it->state == WorkingSetMember::LOC_AND_IDX) { - it->isSuspicious = true; - } - else if (it->state == WorkingSetMember::LOC_AND_UNOWNED_OBJ) { - // We already have the data so convert directly to owned state. - it->obj.setValue(it->obj.value().getOwned()); - it->state = WorkingSetMember::LOC_AND_OWNED_OBJ; - } - } + // We can't do anything without a RecordId. + if (!member->hasLoc()) { + return false; } - // static - bool WorkingSetCommon::fetch(OperationContext* txn, - WorkingSetMember* member, - unowned_ptr<RecordCursor> cursor) { - // The RecordFetcher should already have been transferred out of the WSM and used. - invariant(!member->hasFetcher()); - - // We should have a RecordId but need to retrieve the obj. Get the obj now and reset all WSM - // state appropriately. - invariant(member->hasLoc()); - - member->obj.reset(); - auto record = cursor->seekExact(member->loc); - if (!record) { - return false; - } - - member->obj = {txn->recoveryUnit()->getSnapshotId(), record->data.releaseToBson()}; - - if (member->isSuspicious) { - // Make sure that all of the keyData is still valid for this copy of the document. - // This ensures both that index-provided filters and sort orders still hold. - // TODO provide a way for the query planner to opt out of this checking if it is - // unneeded due to the structure of the plan. - invariant(!member->keyData.empty()); - for (size_t i = 0; i < member->keyData.size(); i++) { - BSONObjSet keys; - member->keyData[i].index->getKeys(member->obj.value(), &keys); - if (!keys.count(member->keyData[i].keyData)) { - // document would no longer be at this position in the index. - return false; - } - } - - member->isSuspicious = false; + // Do the fetch, invalidate the DL. 
+ member->obj = collection->docFor(txn, member->loc); + member->obj.setValue(member->obj.value().getOwned()); + + member->state = WorkingSetMember::OWNED_OBJ; + member->loc = RecordId(); + return true; +} + +void WorkingSetCommon::prepareForSnapshotChange(WorkingSet* workingSet) { + dassert(supportsDocLocking()); + + for (WorkingSet::iterator it = workingSet->begin(); it != workingSet->end(); ++it) { + if (it->state == WorkingSetMember::LOC_AND_IDX) { + it->isSuspicious = true; + } else if (it->state == WorkingSetMember::LOC_AND_UNOWNED_OBJ) { + // We already have the data so convert directly to owned state. + it->obj.setValue(it->obj.value().getOwned()); + it->state = WorkingSetMember::LOC_AND_OWNED_OBJ; } - - member->keyData.clear(); - member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; - return true; + } +} + +// static +bool WorkingSetCommon::fetch(OperationContext* txn, + WorkingSetMember* member, + unowned_ptr<RecordCursor> cursor) { + // The RecordFetcher should already have been transferred out of the WSM and used. + invariant(!member->hasFetcher()); + + // We should have a RecordId but need to retrieve the obj. Get the obj now and reset all WSM + // state appropriately. + invariant(member->hasLoc()); + + member->obj.reset(); + auto record = cursor->seekExact(member->loc); + if (!record) { + return false; } - // static - void WorkingSetCommon::initFrom(WorkingSetMember* dest, const WorkingSetMember& src) { - dest->loc = src.loc; - dest->obj = src.obj; - dest->keyData = src.keyData; - dest->state = src.state; - - // Merge computed data. - typedef WorkingSetComputedDataType WSCD; - for (WSCD i = WSCD(0); i < WSM_COMPUTED_NUM_TYPES; i = WSCD(i + 1)) { - if (src.hasComputed(i)) { - dest->addComputed(src.getComputed(i)->clone()); + member->obj = {txn->recoveryUnit()->getSnapshotId(), record->data.releaseToBson()}; + + if (member->isSuspicious) { + // Make sure that all of the keyData is still valid for this copy of the document. 
+ // This ensures both that index-provided filters and sort orders still hold. + // TODO provide a way for the query planner to opt out of this checking if it is + // unneeded due to the structure of the plan. + invariant(!member->keyData.empty()); + for (size_t i = 0; i < member->keyData.size(); i++) { + BSONObjSet keys; + member->keyData[i].index->getKeys(member->obj.value(), &keys); + if (!keys.count(member->keyData[i].keyData)) { + // document would no longer be at this position in the index. + return false; } } - } - - // static - BSONObj WorkingSetCommon::buildMemberStatusObject(const Status& status) { - BSONObjBuilder bob; - bob.append("ok", status.isOK() ? 1.0 : 0.0); - bob.append("code", status.code()); - bob.append("errmsg", status.reason()); - return bob.obj(); + member->isSuspicious = false; } - // static - WorkingSetID WorkingSetCommon::allocateStatusMember(WorkingSet* ws, const Status& status) { - invariant(ws); - - WorkingSetID wsid = ws->allocate(); - WorkingSetMember* member = ws->get(wsid); - member->state = WorkingSetMember::OWNED_OBJ; - member->obj = Snapshotted<BSONObj>(SnapshotId(), buildMemberStatusObject(status)); - - return wsid; - } - - // static - bool WorkingSetCommon::isValidStatusMemberObject(const BSONObj& obj) { - return obj.nFields() == 3 && - obj.hasField("ok") && - obj.hasField("code") && - obj.hasField("errmsg"); - } - - // static - void WorkingSetCommon::getStatusMemberObject(const WorkingSet& ws, WorkingSetID wsid, - BSONObj* objOut) { - invariant(objOut); - - // Validate ID and working set member. 
- if (WorkingSet::INVALID_ID == wsid) { - return; - } - WorkingSetMember* member = ws.get(wsid); - if (!member->hasOwnedObj()) { - return; + member->keyData.clear(); + member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; + return true; +} + +// static +void WorkingSetCommon::initFrom(WorkingSetMember* dest, const WorkingSetMember& src) { + dest->loc = src.loc; + dest->obj = src.obj; + dest->keyData = src.keyData; + dest->state = src.state; + + // Merge computed data. + typedef WorkingSetComputedDataType WSCD; + for (WSCD i = WSCD(0); i < WSM_COMPUTED_NUM_TYPES; i = WSCD(i + 1)) { + if (src.hasComputed(i)) { + dest->addComputed(src.getComputed(i)->clone()); } - BSONObj obj = member->obj.value(); - if (!isValidStatusMemberObject(obj)) { - return; - } - *objOut = obj; } - - // static - Status WorkingSetCommon::getMemberObjectStatus(const BSONObj& memberObj) { - invariant(WorkingSetCommon::isValidStatusMemberObject(memberObj)); - return Status(static_cast<ErrorCodes::Error>(memberObj["code"].numberInt()), - memberObj["errmsg"]); +} + +// static +BSONObj WorkingSetCommon::buildMemberStatusObject(const Status& status) { + BSONObjBuilder bob; + bob.append("ok", status.isOK() ? 
1.0 : 0.0); + bob.append("code", status.code()); + bob.append("errmsg", status.reason()); + + return bob.obj(); +} + +// static +WorkingSetID WorkingSetCommon::allocateStatusMember(WorkingSet* ws, const Status& status) { + invariant(ws); + + WorkingSetID wsid = ws->allocate(); + WorkingSetMember* member = ws->get(wsid); + member->state = WorkingSetMember::OWNED_OBJ; + member->obj = Snapshotted<BSONObj>(SnapshotId(), buildMemberStatusObject(status)); + + return wsid; +} + +// static +bool WorkingSetCommon::isValidStatusMemberObject(const BSONObj& obj) { + return obj.nFields() == 3 && obj.hasField("ok") && obj.hasField("code") && + obj.hasField("errmsg"); +} + +// static +void WorkingSetCommon::getStatusMemberObject(const WorkingSet& ws, + WorkingSetID wsid, + BSONObj* objOut) { + invariant(objOut); + + // Validate ID and working set member. + if (WorkingSet::INVALID_ID == wsid) { + return; } - - // static - Status WorkingSetCommon::getMemberStatus(const WorkingSetMember& member) { - invariant(member.hasObj()); - return getMemberObjectStatus(member.obj.value()); + WorkingSetMember* member = ws.get(wsid); + if (!member->hasOwnedObj()) { + return; } - - // static - std::string WorkingSetCommon::toStatusString(const BSONObj& obj) { - if (!isValidStatusMemberObject(obj)) { - Status unknownStatus(ErrorCodes::UnknownError, "no details available"); - return unknownStatus.toString(); - } - Status status(ErrorCodes::fromInt(obj.getIntField("code")), - obj.getStringField("errmsg")); - return status.toString(); + BSONObj obj = member->obj.value(); + if (!isValidStatusMemberObject(obj)) { + return; + } + *objOut = obj; +} + +// static +Status WorkingSetCommon::getMemberObjectStatus(const BSONObj& memberObj) { + invariant(WorkingSetCommon::isValidStatusMemberObject(memberObj)); + return Status(static_cast<ErrorCodes::Error>(memberObj["code"].numberInt()), + memberObj["errmsg"]); +} + +// static +Status WorkingSetCommon::getMemberStatus(const WorkingSetMember& member) { + 
invariant(member.hasObj()); + return getMemberObjectStatus(member.obj.value()); +} + +// static +std::string WorkingSetCommon::toStatusString(const BSONObj& obj) { + if (!isValidStatusMemberObject(obj)) { + Status unknownStatus(ErrorCodes::UnknownError, "no details available"); + return unknownStatus.toString(); } + Status status(ErrorCodes::fromInt(obj.getIntField("code")), obj.getStringField("errmsg")); + return status.toString(); +} } // namespace mongo diff --git a/src/mongo/db/exec/working_set_common.h b/src/mongo/db/exec/working_set_common.h index aa1ecdb96c6..2fde57fda3a 100644 --- a/src/mongo/db/exec/working_set_common.h +++ b/src/mongo/db/exec/working_set_common.h @@ -33,104 +33,104 @@ namespace mongo { - class CanonicalQuery; - class Collection; - class OperationContext; - class RecordCursor; - - class WorkingSetCommon { - public: - /** - * Get an owned copy of the BSONObj the WSM refers to. - * Requires either a valid BSONObj or valid RecordId. - * Returns true if the fetch and invalidate succeeded, false otherwise. - */ - static bool fetchAndInvalidateLoc(OperationContext* txn, - WorkingSetMember* member, - const Collection* collection); - - /** - * This must be called as part of "saveState" operations after all nodes in the tree save - * their state. - * - * Iterates over 'workingSet' and converts all LOC_AND_UNOWNED_OBJ members to - * LOC_AND_OWNED_OBJ by calling getOwned on their obj. Also sets the isSuspicious flag on - * all nodes in LOC_AND_IDX state. - */ - static void prepareForSnapshotChange(WorkingSet* workingSet); - - /** - * Retrieves the document corresponding to 'member' from 'collection', and sets the state of - * 'member' appropriately. - * - * If false is returned, the document should not be considered for the result set. It is the - * caller's responsibility to free 'member' in this case. - * - * WriteConflict exceptions may be thrown. When they are, 'member' will be unmodified. 
- */ - static bool fetch(OperationContext* txn, - WorkingSetMember* member, - unowned_ptr<RecordCursor> cursor); - - static bool fetchIfUnfetched(OperationContext* txn, - WorkingSetMember* member, - unowned_ptr<RecordCursor> cursor) { - if (member->hasObj()) return true; - return fetch(txn, member, cursor); - } - - /** - * Initialize the fields in 'dest' from 'src', creating copies of owned objects as needed. - */ - static void initFrom(WorkingSetMember* dest, const WorkingSetMember& src); - - /** - * Build a BSONObj which represents a Status to return in a WorkingSet. - */ - static BSONObj buildMemberStatusObject(const Status& status); - - /** - * Allocate a new WSM and initialize it with - * the code and reason from the status. - * Owned BSON object will have the following layout: - * { - * ok: <ok>, // 1 for OK; 0 otherwise. - * code: <code>, // Status::code() - * errmsg: <errmsg> // Status::reason() - * } - */ - static WorkingSetID allocateStatusMember(WorkingSet* ws, const Status& status); - - /** - * Returns true if object was created by allocateStatusMember(). - */ - static bool isValidStatusMemberObject(const BSONObj& obj); - - /** - * Returns object in working set member created with allocateStatusMember(). - * Does not assume isValidStatusMemberObject. - * If the WSID is invalid or the working set member is created by - * allocateStatusMember, objOut will not be updated. - */ - static void getStatusMemberObject(const WorkingSet& ws, WorkingSetID wsid, - BSONObj* objOut); - - /** - * Returns status from working set member object. - * Assumes isValidStatusMemberObject(). - */ - static Status getMemberObjectStatus(const BSONObj& memberObj); - - /** - * Returns status from working set member created with allocateStatusMember(). - * Assumes isValidStatusMemberObject(). - */ - static Status getMemberStatus(const WorkingSetMember& member); - - /** - * Formats working set member object created with allocateStatusMember(). 
- */ - static std::string toStatusString(const BSONObj& obj); - }; +class CanonicalQuery; +class Collection; +class OperationContext; +class RecordCursor; + +class WorkingSetCommon { +public: + /** + * Get an owned copy of the BSONObj the WSM refers to. + * Requires either a valid BSONObj or valid RecordId. + * Returns true if the fetch and invalidate succeeded, false otherwise. + */ + static bool fetchAndInvalidateLoc(OperationContext* txn, + WorkingSetMember* member, + const Collection* collection); + + /** + * This must be called as part of "saveState" operations after all nodes in the tree save + * their state. + * + * Iterates over 'workingSet' and converts all LOC_AND_UNOWNED_OBJ members to + * LOC_AND_OWNED_OBJ by calling getOwned on their obj. Also sets the isSuspicious flag on + * all nodes in LOC_AND_IDX state. + */ + static void prepareForSnapshotChange(WorkingSet* workingSet); + + /** + * Retrieves the document corresponding to 'member' from 'collection', and sets the state of + * 'member' appropriately. + * + * If false is returned, the document should not be considered for the result set. It is the + * caller's responsibility to free 'member' in this case. + * + * WriteConflict exceptions may be thrown. When they are, 'member' will be unmodified. + */ + static bool fetch(OperationContext* txn, + WorkingSetMember* member, + unowned_ptr<RecordCursor> cursor); + + static bool fetchIfUnfetched(OperationContext* txn, + WorkingSetMember* member, + unowned_ptr<RecordCursor> cursor) { + if (member->hasObj()) + return true; + return fetch(txn, member, cursor); + } + + /** + * Initialize the fields in 'dest' from 'src', creating copies of owned objects as needed. + */ + static void initFrom(WorkingSetMember* dest, const WorkingSetMember& src); + + /** + * Build a BSONObj which represents a Status to return in a WorkingSet. 
+ */ + static BSONObj buildMemberStatusObject(const Status& status); + + /** + * Allocate a new WSM and initialize it with + * the code and reason from the status. + * Owned BSON object will have the following layout: + * { + * ok: <ok>, // 1 for OK; 0 otherwise. + * code: <code>, // Status::code() + * errmsg: <errmsg> // Status::reason() + * } + */ + static WorkingSetID allocateStatusMember(WorkingSet* ws, const Status& status); + + /** + * Returns true if object was created by allocateStatusMember(). + */ + static bool isValidStatusMemberObject(const BSONObj& obj); + + /** + * Returns object in working set member created with allocateStatusMember(). + * Does not assume isValidStatusMemberObject. + * If the WSID is invalid or the working set member is created by + * allocateStatusMember, objOut will not be updated. + */ + static void getStatusMemberObject(const WorkingSet& ws, WorkingSetID wsid, BSONObj* objOut); + + /** + * Returns status from working set member object. + * Assumes isValidStatusMemberObject(). + */ + static Status getMemberObjectStatus(const BSONObj& memberObj); + + /** + * Returns status from working set member created with allocateStatusMember(). + * Assumes isValidStatusMemberObject(). + */ + static Status getMemberStatus(const WorkingSetMember& member); + + /** + * Formats working set member object created with allocateStatusMember(). 
+ */ + static std::string toStatusString(const BSONObj& obj); +}; } // namespace mongo diff --git a/src/mongo/db/exec/working_set_computed_data.h b/src/mongo/db/exec/working_set_computed_data.h index 53a74633764..c7dc27ce45d 100644 --- a/src/mongo/db/exec/working_set_computed_data.h +++ b/src/mongo/db/exec/working_set_computed_data.h @@ -32,68 +32,72 @@ namespace mongo { - class TextScoreComputedData : public WorkingSetComputedData { - public: - TextScoreComputedData(double score) - : WorkingSetComputedData(WSM_COMPUTED_TEXT_SCORE), - _score(score) { } - - double getScore() const { return _score; } - - virtual TextScoreComputedData* clone() const { - return new TextScoreComputedData(_score); - } - - private: - double _score; - }; - - class GeoDistanceComputedData : public WorkingSetComputedData { - public: - GeoDistanceComputedData(double dist) - : WorkingSetComputedData(WSM_COMPUTED_GEO_DISTANCE), - _dist(dist) { } - - double getDist() const { return _dist; } - - virtual GeoDistanceComputedData* clone() const { - return new GeoDistanceComputedData(_dist); - } - - private: - double _dist; - }; - - class IndexKeyComputedData : public WorkingSetComputedData { - public: - IndexKeyComputedData(BSONObj key) - : WorkingSetComputedData(WSM_INDEX_KEY), - _key(key.getOwned()) { } - - BSONObj getKey() const { return _key; } - - virtual IndexKeyComputedData* clone() const { - return new IndexKeyComputedData(_key); - } - - private: - BSONObj _key; - }; - - class GeoNearPointComputedData : public WorkingSetComputedData { - public: - GeoNearPointComputedData(BSONObj point) - : WorkingSetComputedData(WSM_GEO_NEAR_POINT), - _point(point.getOwned()) { } - - BSONObj getPoint() const { return _point; } - - virtual GeoNearPointComputedData* clone() const { - return new GeoNearPointComputedData(_point); - } - - private: - BSONObj _point; - }; +class TextScoreComputedData : public WorkingSetComputedData { +public: + TextScoreComputedData(double score) + : 
WorkingSetComputedData(WSM_COMPUTED_TEXT_SCORE), _score(score) {} + + double getScore() const { + return _score; + } + + virtual TextScoreComputedData* clone() const { + return new TextScoreComputedData(_score); + } + +private: + double _score; +}; + +class GeoDistanceComputedData : public WorkingSetComputedData { +public: + GeoDistanceComputedData(double dist) + : WorkingSetComputedData(WSM_COMPUTED_GEO_DISTANCE), _dist(dist) {} + + double getDist() const { + return _dist; + } + + virtual GeoDistanceComputedData* clone() const { + return new GeoDistanceComputedData(_dist); + } + +private: + double _dist; +}; + +class IndexKeyComputedData : public WorkingSetComputedData { +public: + IndexKeyComputedData(BSONObj key) + : WorkingSetComputedData(WSM_INDEX_KEY), _key(key.getOwned()) {} + + BSONObj getKey() const { + return _key; + } + + virtual IndexKeyComputedData* clone() const { + return new IndexKeyComputedData(_key); + } + +private: + BSONObj _key; +}; + +class GeoNearPointComputedData : public WorkingSetComputedData { +public: + GeoNearPointComputedData(BSONObj point) + : WorkingSetComputedData(WSM_GEO_NEAR_POINT), _point(point.getOwned()) {} + + BSONObj getPoint() const { + return _point; + } + + virtual GeoNearPointComputedData* clone() const { + return new GeoNearPointComputedData(_point); + } + +private: + BSONObj _point; +}; } // namespace mongo diff --git a/src/mongo/db/exec/working_set_test.cpp b/src/mongo/db/exec/working_set_test.cpp index d4d7163831b..ff95717963c 100644 --- a/src/mongo/db/exec/working_set_test.cpp +++ b/src/mongo/db/exec/working_set_test.cpp @@ -41,207 +41,207 @@ using namespace mongo; namespace { - using std::string; - - class WorkingSetFixture : public mongo::unittest::Test { - protected: - void setUp() { - ws.reset(new WorkingSet()); - WorkingSetID id = ws->allocate(); - ASSERT(id != WorkingSet::INVALID_ID); - member = ws->get(id); - ASSERT(NULL != member); - } - - void tearDown() { - ws.reset(); - member = NULL; - } - - 
std::unique_ptr<WorkingSet> ws; - WorkingSetMember* member; - }; - - TEST_F(WorkingSetFixture, noFieldToGet) { - BSONElement elt; - - // Make sure we're not getting anything out of an invalid WSM. - ASSERT_EQUALS(WorkingSetMember::INVALID, member->state); - ASSERT_FALSE(member->getFieldDotted("foo", &elt)); - - member->state = WorkingSetMember::LOC_AND_IDX; - ASSERT_FALSE(member->getFieldDotted("foo", &elt)); - - // Our state is that of a valid object. The getFieldDotted shouldn't throw; there's - // something to call getFieldDotted on, but there's no field there. - member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; - ASSERT_TRUE(member->getFieldDotted("foo", &elt)); - - member->state = WorkingSetMember::OWNED_OBJ; - ASSERT_TRUE(member->getFieldDotted("foo", &elt)); +using std::string; + +class WorkingSetFixture : public mongo::unittest::Test { +protected: + void setUp() { + ws.reset(new WorkingSet()); + WorkingSetID id = ws->allocate(); + ASSERT(id != WorkingSet::INVALID_ID); + member = ws->get(id); + ASSERT(NULL != member); } - TEST_F(WorkingSetFixture, getFieldUnowned) { - string fieldName = "x"; - - BSONObj obj = BSON(fieldName << 5); - // Not truthful since the loc is bogus, but the loc isn't accessed anyway... - member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; - member->obj = Snapshotted<BSONObj>(SnapshotId(), BSONObj(obj.objdata())); - ASSERT_TRUE(obj.isOwned()); - ASSERT_FALSE(member->obj.value().isOwned()); - - // Get out the field we put in. 
- BSONElement elt; - ASSERT_TRUE(member->getFieldDotted(fieldName, &elt)); - ASSERT_EQUALS(elt.numberInt(), 5); + void tearDown() { + ws.reset(); + member = NULL; } - TEST_F(WorkingSetFixture, getFieldOwned) { - string fieldName = "x"; - - BSONObj obj = BSON(fieldName << 5); - member->obj = Snapshotted<BSONObj>(SnapshotId(), obj); - ASSERT_TRUE(member->obj.value().isOwned()); - member->state = WorkingSetMember::OWNED_OBJ; - BSONElement elt; - ASSERT_TRUE(member->getFieldDotted(fieldName, &elt)); - ASSERT_EQUALS(elt.numberInt(), 5); + std::unique_ptr<WorkingSet> ws; + WorkingSetMember* member; +}; + +TEST_F(WorkingSetFixture, noFieldToGet) { + BSONElement elt; + + // Make sure we're not getting anything out of an invalid WSM. + ASSERT_EQUALS(WorkingSetMember::INVALID, member->state); + ASSERT_FALSE(member->getFieldDotted("foo", &elt)); + + member->state = WorkingSetMember::LOC_AND_IDX; + ASSERT_FALSE(member->getFieldDotted("foo", &elt)); + + // Our state is that of a valid object. The getFieldDotted shouldn't throw; there's + // something to call getFieldDotted on, but there's no field there. + member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; + ASSERT_TRUE(member->getFieldDotted("foo", &elt)); + + member->state = WorkingSetMember::OWNED_OBJ; + ASSERT_TRUE(member->getFieldDotted("foo", &elt)); +} + +TEST_F(WorkingSetFixture, getFieldUnowned) { + string fieldName = "x"; + + BSONObj obj = BSON(fieldName << 5); + // Not truthful since the loc is bogus, but the loc isn't accessed anyway... + member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; + member->obj = Snapshotted<BSONObj>(SnapshotId(), BSONObj(obj.objdata())); + ASSERT_TRUE(obj.isOwned()); + ASSERT_FALSE(member->obj.value().isOwned()); + + // Get out the field we put in. 
+ BSONElement elt; + ASSERT_TRUE(member->getFieldDotted(fieldName, &elt)); + ASSERT_EQUALS(elt.numberInt(), 5); +} + +TEST_F(WorkingSetFixture, getFieldOwned) { + string fieldName = "x"; + + BSONObj obj = BSON(fieldName << 5); + member->obj = Snapshotted<BSONObj>(SnapshotId(), obj); + ASSERT_TRUE(member->obj.value().isOwned()); + member->state = WorkingSetMember::OWNED_OBJ; + BSONElement elt; + ASSERT_TRUE(member->getFieldDotted(fieldName, &elt)); + ASSERT_EQUALS(elt.numberInt(), 5); +} + +TEST_F(WorkingSetFixture, getFieldFromIndex) { + string firstName = "x"; + int firstValue = 5; + + string secondName = "y"; + int secondValue = 10; + + member->keyData.push_back(IndexKeyDatum(BSON(firstName << 1), BSON("" << firstValue), NULL)); + // Also a minor lie as loc is bogus. + member->state = WorkingSetMember::LOC_AND_IDX; + BSONElement elt; + ASSERT_TRUE(member->getFieldDotted(firstName, &elt)); + ASSERT_EQUALS(elt.numberInt(), firstValue); + // No foo field. + ASSERT_FALSE(member->getFieldDotted("foo", &elt)); + + // Add another index datum. + member->keyData.push_back(IndexKeyDatum(BSON(secondName << 1), BSON("" << secondValue), NULL)); + ASSERT_TRUE(member->getFieldDotted(secondName, &elt)); + ASSERT_EQUALS(elt.numberInt(), secondValue); + ASSERT_TRUE(member->getFieldDotted(firstName, &elt)); + ASSERT_EQUALS(elt.numberInt(), firstValue); + // Still no foo. 
+ ASSERT_FALSE(member->getFieldDotted("foo", &elt)); +} + +TEST_F(WorkingSetFixture, getDottedFieldFromIndex) { + string firstName = "x.y"; + int firstValue = 5; + + member->keyData.push_back(IndexKeyDatum(BSON(firstName << 1), BSON("" << firstValue), NULL)); + member->state = WorkingSetMember::LOC_AND_IDX; + BSONElement elt; + ASSERT_TRUE(member->getFieldDotted(firstName, &elt)); + ASSERT_EQUALS(elt.numberInt(), firstValue); + ASSERT_FALSE(member->getFieldDotted("x", &elt)); + ASSERT_FALSE(member->getFieldDotted("y", &elt)); +} + +// +// WorkingSet::iterator tests +// + +TEST(WorkingSetIteratorTest, BasicIteratorTest) { + WorkingSet ws; + + WorkingSetID id1 = ws.allocate(); + WorkingSetMember* member1 = ws.get(id1); + member1->state = WorkingSetMember::LOC_AND_IDX; + member1->keyData.push_back(IndexKeyDatum(BSON("a" << 1), BSON("" << 3), NULL)); + + WorkingSetID id2 = ws.allocate(); + WorkingSetMember* member2 = ws.get(id2); + member2->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; + member2->obj = Snapshotted<BSONObj>(SnapshotId(), BSON("a" << 3)); + + int counter = 0; + for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { + ASSERT(it->state == WorkingSetMember::LOC_AND_IDX || + it->state == WorkingSetMember::LOC_AND_UNOWNED_OBJ); + counter++; } + ASSERT_EQ(counter, 2); +} - TEST_F(WorkingSetFixture, getFieldFromIndex) { - string firstName = "x"; - int firstValue = 5; - - string secondName = "y"; - int secondValue = 10; - - member->keyData.push_back(IndexKeyDatum(BSON(firstName << 1), BSON("" << firstValue), NULL)); - // Also a minor lie as loc is bogus. - member->state = WorkingSetMember::LOC_AND_IDX; - BSONElement elt; - ASSERT_TRUE(member->getFieldDotted(firstName, &elt)); - ASSERT_EQUALS(elt.numberInt(), firstValue); - // No foo field. - ASSERT_FALSE(member->getFieldDotted("foo", &elt)); - - // Add another index datum. 
- member->keyData.push_back(IndexKeyDatum(BSON(secondName << 1), BSON("" << secondValue), NULL)); - ASSERT_TRUE(member->getFieldDotted(secondName, &elt)); - ASSERT_EQUALS(elt.numberInt(), secondValue); - ASSERT_TRUE(member->getFieldDotted(firstName, &elt)); - ASSERT_EQUALS(elt.numberInt(), firstValue); - // Still no foo. - ASSERT_FALSE(member->getFieldDotted("foo", &elt)); - } +TEST(WorkingSetIteratorTest, EmptyWorkingSet) { + WorkingSet ws; - TEST_F(WorkingSetFixture, getDottedFieldFromIndex) { - string firstName = "x.y"; - int firstValue = 5; - - member->keyData.push_back(IndexKeyDatum(BSON(firstName << 1), BSON("" << firstValue), NULL)); - member->state = WorkingSetMember::LOC_AND_IDX; - BSONElement elt; - ASSERT_TRUE(member->getFieldDotted(firstName, &elt)); - ASSERT_EQUALS(elt.numberInt(), firstValue); - ASSERT_FALSE(member->getFieldDotted("x", &elt)); - ASSERT_FALSE(member->getFieldDotted("y", &elt)); + int counter = 0; + for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { + counter++; } + ASSERT_EQ(counter, 0); +} - // - // WorkingSet::iterator tests - // +TEST(WorkingSetIteratorTest, EmptyWorkingSetDueToFree) { + WorkingSet ws; - TEST(WorkingSetIteratorTest, BasicIteratorTest) { - WorkingSet ws; + WorkingSetID id = ws.allocate(); + ws.free(id); - WorkingSetID id1 = ws.allocate(); - WorkingSetMember* member1 = ws.get(id1); - member1->state = WorkingSetMember::LOC_AND_IDX; - member1->keyData.push_back(IndexKeyDatum(BSON("a" << 1), BSON("" << 3), NULL)); - - WorkingSetID id2 = ws.allocate(); - WorkingSetMember* member2 = ws.get(id2); - member2->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; - member2->obj = Snapshotted<BSONObj>(SnapshotId(), BSON("a" << 3)); - - int counter = 0; - for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { - ASSERT(it->state == WorkingSetMember::LOC_AND_IDX || - it->state == WorkingSetMember::LOC_AND_UNOWNED_OBJ); - counter++; - } - ASSERT_EQ(counter, 2); + int counter = 0; + for (WorkingSet::iterator it 
= ws.begin(); it != ws.end(); ++it) { + counter++; } + ASSERT_EQ(counter, 0); +} - TEST(WorkingSetIteratorTest, EmptyWorkingSet) { - WorkingSet ws; +TEST(WorkingSetIteratorTest, MixedFreeAndInUse) { + WorkingSet ws; - int counter = 0; - for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { - counter++; - } - ASSERT_EQ(counter, 0); - } + WorkingSetID id1 = ws.allocate(); + WorkingSetID id2 = ws.allocate(); + WorkingSetID id3 = ws.allocate(); - TEST(WorkingSetIteratorTest, EmptyWorkingSetDueToFree) { - WorkingSet ws; + WorkingSetMember* member = ws.get(id2); + member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; + member->obj = Snapshotted<BSONObj>(SnapshotId(), BSON("a" << 3)); - WorkingSetID id = ws.allocate(); - ws.free(id); + ws.free(id1); + ws.free(id3); - int counter = 0; - for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { - counter++; - } - ASSERT_EQ(counter, 0); + int counter = 0; + for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { + ASSERT(it->state == WorkingSetMember::LOC_AND_UNOWNED_OBJ); + counter++; } + ASSERT_EQ(counter, 1); +} - TEST(WorkingSetIteratorTest, MixedFreeAndInUse) { - WorkingSet ws; - - WorkingSetID id1 = ws.allocate(); - WorkingSetID id2 = ws.allocate(); - WorkingSetID id3 = ws.allocate(); +TEST(WorkingSetIteratorTest, FreeWhileIterating) { + WorkingSet ws; - WorkingSetMember* member = ws.get(id2); - member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; - member->obj = Snapshotted<BSONObj>(SnapshotId(), BSON("a" << 3)); + ws.allocate(); + ws.allocate(); + ws.allocate(); - ws.free(id1); - ws.free(id3); - - int counter = 0; - for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { - ASSERT(it->state == WorkingSetMember::LOC_AND_UNOWNED_OBJ); - counter++; + // Free the last two members during iteration. 
+ int counter = 0; + for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { + if (counter > 0) { + it.free(); } - ASSERT_EQ(counter, 1); + counter++; } + ASSERT_EQ(counter, 3); - TEST(WorkingSetIteratorTest, FreeWhileIterating) { - WorkingSet ws; - - ws.allocate(); - ws.allocate(); - ws.allocate(); - - // Free the last two members during iteration. - int counter = 0; - for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { - if (counter > 0) { - it.free(); - } - counter++; - } - ASSERT_EQ(counter, 3); - - // Verify that only one item remains in the working set. - counter = 0; - for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { - counter++; - } - ASSERT_EQ(counter, 1); + // Verify that only one item remains in the working set. + counter = 0; + for (WorkingSet::iterator it = ws.begin(); it != ws.end(); ++it) { + counter++; } + ASSERT_EQ(counter, 1); +} } // namespace |