/**
 * Copyright (C) 2018-present MongoDB, Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the Server Side Public License, version 1,
 * as published by MongoDB, Inc.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * Server Side Public License for more details.
 *
 * You should have received a copy of the Server Side Public License
 * along with this program. If not, see
 * <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 * As a special exception, the copyright holders give permission to link the
 * code of portions of this program with the OpenSSL library under certain
 * conditions as described in each individual source file and distribute
 * linked combinations including the program with the OpenSSL library. You
 * must comply with the Server Side Public License in all respects for
 * all of the code used other than as permitted herein. If you modify file(s)
 * with this exception, you may extend this exception to your version of the
 * file(s), but you are not obligated to do so. If you do not wish to do so,
 * delete this exception statement from your version. If you delete this
 * exception statement from all source files in the program, then also delete
 * it in the license file.
 */

#include "mongo/db/exec/and_hash.h"

#include "mongo/db/exec/and_common-inl.h"
#include "mongo/db/exec/scoped_timer.h"
#include "mongo/db/exec/working_set.h"
#include "mongo/db/exec/working_set_common.h"
#include "mongo/stdx/memory.h"
#include "mongo/util/mongoutils/str.h"

namespace {

// Upper limit for buffered data.
// Stage execution will fail once the size of all buffered data exceeds this threshold.
const size_t kDefaultMaxMemUsageBytes = 32 * 1024 * 1024;

}  // namespace

namespace mongo {

using std::unique_ptr;
using std::vector;
using stdx::make_unique;

const size_t AndHashStage::kLookAheadWorks = 10;

// static
const char* AndHashStage::kStageType = "AND_HASH";

AndHashStage::AndHashStage(OperationContext* opCtx, WorkingSet* ws, const Collection* collection)
    : PlanStage(kStageType, opCtx),
      _collection(collection),
      _ws(ws),
      _hashingChildren(true),
      _currentChild(0),
      _memUsage(0),
      _maxMemUsage(kDefaultMaxMemUsageBytes) {}

AndHashStage::AndHashStage(OperationContext* opCtx,
                           WorkingSet* ws,
                           const Collection* collection,
                           size_t maxMemUsage)
    : PlanStage(kStageType, opCtx),
      _collection(collection),
      _ws(ws),
      _hashingChildren(true),
      _currentChild(0),
      _memUsage(0),
      _maxMemUsage(maxMemUsage) {}

void AndHashStage::addChild(PlanStage* child) {
    _children.emplace_back(child);
}

size_t AndHashStage::getMemUsage() const {
    return _memUsage;
}

bool AndHashStage::isEOF() {
    // This is empty before calling work() and not-empty after.
    if (_lookAheadResults.empty()) {
        return false;
    }

    // Either we're busy hashing children, in which case we're not done yet.
    if (_hashingChildren) {
        return false;
    }

    // Or we're streaming in results from the last child.

    // If there's nothing to probe against, we're EOF.
    if (_dataMap.empty()) {
        return true;
    }

    // Otherwise, we're done when the last child is done.
    invariant(_children.size() >= 2);
    return (WorkingSet::INVALID_ID == _lookAheadResults[_children.size() - 1]) &&
        _children[_children.size() - 1]->isEOF();
}

PlanStage::StageState AndHashStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
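    // The stage proceeds in three phases: (1) a one-time look-ahead that works each child a
    // few times to detect a trivially empty AND, (2) hashing, in which children 0..(n-2) build
    // and then prune the RecordId hash table, and (3) probing, in which the last child's
    // results are matched against the table and merged matches are returned.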
    // Fast-path for one of our children being EOF immediately.  We work each child a few
    // times.  If it hits EOF, the AND cannot output anything.  If it produces a result, we
    // stash that result in _lookAheadResults.
    if (_lookAheadResults.empty()) {
        // INVALID_ID means that the child didn't produce a valid result.

        // We specifically are not using .resize(size, value) here because C++11 builds don't
        // seem to resolve WorkingSet::INVALID_ID during linking.
        _lookAheadResults.resize(_children.size());
        for (size_t i = 0; i < _children.size(); ++i) {
            _lookAheadResults[i] = WorkingSet::INVALID_ID;
        }

        // Work each child some number of times until it's either EOF or produces
        // a result.  If it's EOF this whole stage will be EOF.  If it produces a
        // result we cache it for later.
        for (size_t i = 0; i < _children.size(); ++i) {
            auto& child = _children[i];
            for (size_t j = 0; j < kLookAheadWorks; ++j) {
                // Cache the result in _lookAheadResults[i].
                StageState childStatus = child->work(&_lookAheadResults[i]);

                if (PlanStage::IS_EOF == childStatus) {
                    // A child went right to EOF.  Bail out.
                    _hashingChildren = false;
                    _dataMap.clear();
                    return PlanStage::IS_EOF;
                } else if (PlanStage::ADVANCED == childStatus) {
                    // Ensure that the BSONObj underlying the WorkingSetMember is owned in case
                    // we yield.
                    _ws->get(_lookAheadResults[i])->makeObjOwnedIfNeeded();
                    break;  // Stop looking at this child.
                } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) {
                    // The stage which produces a failure is responsible for allocating a
                    // working set member with error details.
                    invariant(WorkingSet::INVALID_ID != _lookAheadResults[i]);
                    *out = _lookAheadResults[i];
                    _hashingChildren = false;
                    _dataMap.clear();
                    return childStatus;
                }
                // We ignore NEED_TIME.  TODO: what do we want to do if we get NEED_YIELD here?
            }
        }

        // We did a bunch of work above, return NEED_TIME to be fair.
        return PlanStage::NEED_TIME;
    }

    // An AND is either reading the first child into the hash table, probing against the hash
    // table with subsequent children, or checking the last child's results to see if they're
    // in the hash table.

    // We read the first child into our hash table.
    if (_hashingChildren) {
        // Check memory usage of previously hashed results.
        if (_memUsage > _maxMemUsage) {
            mongoutils::str::stream ss;
            ss << "hashed AND stage buffered data usage of " << _memUsage
               << " bytes exceeds internal limit of " << _maxMemUsage << " bytes";
            Status status(ErrorCodes::Overflow, ss);
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            return PlanStage::FAILURE;
        }

        if (0 == _currentChild) {
            return readFirstChild(out);
        } else if (_currentChild < _children.size() - 1) {
            return hashOtherChildren(out);
        } else {
            _hashingChildren = false;
            // We don't hash our last child.  Instead, we probe the table created from the
            // previous children, returning results in the order of the last child.
            // Fall through to below.
        }
    }

    // Returning results.  We read from the last child and return the results that are in our
    // hash map.
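    // Each RecordId hit is erased from _dataMap before being handed back, so a buffered member
    // is returned at most once; on a hit, ownership of the buffered WorkingSetID transfers to
    // the caller through *out.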

    // We should be EOF if we're not hashing results and the dataMap is empty.
    verify(!_dataMap.empty());

    // We probe _dataMap with the last child.
    verify(_currentChild == _children.size() - 1);

    // Get the next result for the (_children.size() - 1)-th child.
    StageState childStatus = workChild(_children.size() - 1, out);
    if (PlanStage::ADVANCED != childStatus) {
        return childStatus;
    }

    // We know that we've ADVANCED.  See if the WSM is in our table.
    WorkingSetMember* member = _ws->get(*out);

    // The child must give us a WorkingSetMember with a record id, since we intersect index
    // keys based on the record id.  The planner ensures that the child stage can never produce
    // a WSM with no record id.
    invariant(member->hasRecordId());

    DataMap::iterator it = _dataMap.find(member->recordId);
    if (_dataMap.end() == it) {
        // Child's output wasn't in every previous child.  Throw it out.
        _ws->free(*out);
        return PlanStage::NEED_TIME;
    } else {
        // Child's output was in every previous child.  Merge any key data in
        // the child's output and free the child's just-outputted WSM.
        WorkingSetID hashID = it->second;
        _dataMap.erase(it);

        AndCommon::mergeFrom(_ws, hashID, *member);
        _ws->free(*out);

        *out = hashID;
        return PlanStage::ADVANCED;
    }
}

PlanStage::StageState AndHashStage::workChild(size_t childNo, WorkingSetID* out) {
    if (WorkingSet::INVALID_ID != _lookAheadResults[childNo]) {
        *out = _lookAheadResults[childNo];
        _lookAheadResults[childNo] = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    } else {
        return _children[childNo]->work(out);
    }
}

PlanStage::StageState AndHashStage::readFirstChild(WorkingSetID* out) {
    verify(_currentChild == 0);

    WorkingSetID id = WorkingSet::INVALID_ID;
    StageState childStatus = workChild(0, &id);

    if (PlanStage::ADVANCED == childStatus) {
        WorkingSetMember* member = _ws->get(id);

        // The child must give us a WorkingSetMember with a record id, since we intersect index
        // keys based on the record id.  The planner ensures that the child stage can never
        // produce a WSM with no record id.
        invariant(member->hasRecordId());

        if (!_dataMap.insert(std::make_pair(member->recordId, id)).second) {
            // Didn't insert because we already had this RecordId inside the map.  This should
            // only happen if we're seeing a newer copy of the same doc in a more recent
            // snapshot.  Throw out the newer copy of the doc.
            _ws->free(id);
            return PlanStage::NEED_TIME;
        }

        // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield.
        member->makeObjOwnedIfNeeded();

        // Update memory stats.
        _memUsage += member->getMemUsage();

        return PlanStage::NEED_TIME;
    } else if (PlanStage::IS_EOF == childStatus) {
        // Done reading child 0.
        _currentChild = 1;

        // If our first child was empty, don't scan any others, no possible results.
        if (_dataMap.empty()) {
            _hashingChildren = false;
            return PlanStage::IS_EOF;
        }

        _specificStats.mapAfterChild.push_back(_dataMap.size());

        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) {
        // The stage which produces a failure is responsible for allocating a working set
        // member with error details.
        invariant(WorkingSet::INVALID_ID != id);
        *out = id;
        return childStatus;
    } else {
        if (PlanStage::NEED_YIELD == childStatus) {
            *out = id;
        }

        return childStatus;
    }
}
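// Probes _dataMap with the output of child _currentChild (one of the middle children).
// RecordIds seen from this child are tracked in _seenMap; when the child reaches EOF, any
// buffered entry not in _seenMap is freed and dropped, leaving _dataMap as the intersection
// of the children processed so far.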
PlanStage::StageState AndHashStage::hashOtherChildren(WorkingSetID* out) {
    verify(_currentChild > 0);

    WorkingSetID id = WorkingSet::INVALID_ID;
    StageState childStatus = workChild(_currentChild, &id);

    if (PlanStage::ADVANCED == childStatus) {
        WorkingSetMember* member = _ws->get(id);

        // The child must give us a WorkingSetMember with a record id, since we intersect index
        // keys based on the record id.  The planner ensures that the child stage can never
        // produce a WSM with no record id.
        invariant(member->hasRecordId());

        if (_dataMap.end() == _dataMap.find(member->recordId)) {
            // Ignore.  It's not in any previous child.
        } else {
            // We have a hit.  Copy data into the WSM we already have.
            _seenMap.insert(member->recordId);
            WorkingSetID olderMemberID = _dataMap[member->recordId];
            WorkingSetMember* olderMember = _ws->get(olderMemberID);
            size_t memUsageBefore = olderMember->getMemUsage();

            AndCommon::mergeFrom(_ws, olderMemberID, *member);

            // Update memory stats.
            _memUsage += olderMember->getMemUsage() - memUsageBefore;
        }
        _ws->free(id);
        return PlanStage::NEED_TIME;
    } else if (PlanStage::IS_EOF == childStatus) {
        // Finished with a child.
        ++_currentChild;

        // Keep elements of _dataMap that are in _seenMap.
        DataMap::iterator it = _dataMap.begin();
        while (it != _dataMap.end()) {
            if (_seenMap.end() == _seenMap.find(it->first)) {
                DataMap::iterator toErase = it;
                ++it;

                // Update memory stats.
                WorkingSetMember* member = _ws->get(toErase->second);
                _memUsage -= member->getMemUsage();

                _ws->free(toErase->second);
                _dataMap.erase(toErase);
            } else {
                ++it;
            }
        }

        _specificStats.mapAfterChild.push_back(_dataMap.size());

        _seenMap.clear();

        // _dataMap is now the intersection of the first _currentChild nodes.

        // If we have nothing to AND with after finishing any child, stop.
        if (_dataMap.empty()) {
            _hashingChildren = false;
            return PlanStage::IS_EOF;
        }

        // We've finished scanning all children.  Return results with the next call to work().
        if (_currentChild == _children.size()) {
            _hashingChildren = false;
        }

        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) {
        // The stage which produces a failure is responsible for allocating a working set
        // member with error details.
        invariant(WorkingSet::INVALID_ID != id);
        *out = id;
        return childStatus;
    } else {
        if (PlanStage::NEED_YIELD == childStatus) {
            *out = id;
        }

        return childStatus;
    }
}

unique_ptr<PlanStageStats> AndHashStage::getStats() {
    _commonStats.isEOF = isEOF();

    _specificStats.memLimit = _maxMemUsage;
    _specificStats.memUsage = _memUsage;

    unique_ptr<PlanStageStats> ret = make_unique<PlanStageStats>(_commonStats, STAGE_AND_HASH);
    ret->specific = make_unique<AndHashStats>(_specificStats);
    for (size_t i = 0; i < _children.size(); ++i) {
        ret->children.emplace_back(_children[i]->getStats());
    }
    return ret;
}

const SpecificStats* AndHashStage::getSpecificStats() const {
    return &_specificStats;
}

}  // namespace mongo