/** * Copyright (C) 2018-present MongoDB, Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the Server Side Public License, version 1, * as published by MongoDB, Inc. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * Server Side Public License for more details. * * You should have received a copy of the Server Side Public License * along with this program. If not, see * . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the Server Side Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #pragma once #include #include #include "mongo/db/exec/plan_stats.h" #include "mongo/db/exec/working_set.h" namespace mongo { class ClockSource; class Collection; class OperationContext; class RecordId; /** * A PlanStage ("stage") is the basic building block of a "Query Execution Plan." A stage is * the smallest piece of machinery used in executing a compiled query. Stages either access * data (from a collection or an index) to create a stream of results, or transform a stream of * results (e.g. AND, OR, SORT) to create a stream of results. * * Stages have zero or more input streams but only one output stream. Data-accessing stages are * leaves and data-transforming stages have children. Stages can be connected together to form * a tree which is then executed (see plan_executor.h) to solve a query. * * A stage's input and output are each typed. Only stages with compatible types can be * connected. * * All of the stages of a QEP share a WorkingSet (see working_set.h). Data source stages * allocate a slot in the WorkingSet, fill the slot with data, and return the ID of that slot. * Subsequent stages fetch a WorkingSetElement by its ID and operate on the enclosed data. * * Stages do nothing unless work() is called. work() is a request to the stage to consume one * unit of input. Some stages (e.g. AND, SORT) require many calls to work() before generating * output as they must consume many units of input. These stages will inform the caller that * they need more time, and work() must be called again in order to produce an output. * * Every stage of a query implements the PlanStage interface. Queries perform a unit of work * and report on their subsequent status; see StatusCode for possible states. Query results are * passed through the WorkingSet interface; see working_set.h for details. * * All synchronization is the responsibility of the caller. Queries must be told to yield with * saveState() if any underlying database state changes. If saveState() is called, * restoreState() must be called again before any work() is done. * * Here is a very simple usage example: * * WorkingSet workingSet; * PlanStage* rootStage = makeQueryPlan(&workingSet, ...); * while (!rootStage->isEOF()) { * WorkingSetID result; * switch(rootStage->work(&result)) { * case PlanStage::ADVANCED: * // do something with result * WorkingSetMember* member = workingSet.get(result); * cout << "Result: " << member->obj << std::endl; * break; * case PlanStage::IS_EOF: * // All done. Will fall out of while loop. * break; * case PlanStage::NEED_TIME: * // Need more time. * break; * case PlanStage::FAILURE: * // Throw exception or return error * break; * } * * if (shouldYield) { * // Occasionally yield. * stage->saveState(); * // Do work that requires a yield here (execute other plans, insert, delete, etc.). * stage->restoreState(); * } * } */ class PlanStage { public: PlanStage(const char* typeName, OperationContext* opCtx) : _commonStats(typeName), _opCtx(opCtx) {} protected: /** * Obtain a PlanStage given a child stage. Called during the construction of derived * PlanStage types with a single direct descendant. */ PlanStage(OperationContext* opCtx, std::unique_ptr child, const char* typeName) : PlanStage(typeName, opCtx) { _children.push_back(std::move(child)); } public: virtual ~PlanStage() {} using Children = std::vector>; /** * All possible return values of work(...) */ enum StageState { // work(...) has returned a new result in its out parameter. The caller must free it // from the working set when done with it. ADVANCED, // work(...) won't do anything more. isEOF() will also be true. There is nothing // output in the out parameter. IS_EOF, // work(...) needs more time to product a result. Call work(...) again. There is // nothing output in the out parameter. NEED_TIME, // The storage engine says we need to yield, possibly to fetch a record from disk, or // due to an aborted transaction in the storage layer. // // Full yield request semantics: // // Each stage that receives a NEED_YIELD from a child must propagate the NEED_YIELD up // and perform no work. // // If a yield is requested due to a WriteConflict, the out parameter of work(...) should // be populated with WorkingSet::INVALID_ID. If it is illegal to yield, a // WriteConflictException will be thrown. // // A yield-requesting stage populates the out parameter of work(...) with a WSID that // refers to a WSM with a Fetcher*. If it is illegal to yield, this is ignored. This // difference in behavior can be removed once SERVER-16051 is resolved. // // The plan executor is responsible for yielding and, if requested, paging in the data // upon receipt of a NEED_YIELD. The plan executor does NOT free the WSID of the // requested fetch. The stage that requested the fetch holds the WSID of the loc it // wants fetched. On the next call to work() that stage can assume a fetch was performed // on the WSM that the held WSID refers to. NEED_YIELD, // Something has gone unrecoverably wrong. Stop running this query. // If the out parameter does not refer to an invalid working set member, // call WorkingSetCommon::getStatusMemberObject() to get details on the failure. // Any class implementing this interface must set the WSID out parameter to // INVALID_ID or a valid WSM ID if FAILURE is returned. FAILURE, }; static std::string stateStr(const StageState& state) { if (ADVANCED == state) { return "ADVANCED"; } else if (IS_EOF == state) { return "IS_EOF"; } else if (NEED_TIME == state) { return "NEED_TIME"; } else if (NEED_YIELD == state) { return "NEED_YIELD"; } else { verify(FAILURE == state); return "FAILURE"; } } /** * Perform a unit of work on the query. Ask the stage to produce the next unit of output. * Stage returns StageState::ADVANCED if *out is set to the next unit of output. Otherwise, * returns another value of StageState to indicate the stage's status. */ StageState work(WorkingSetID* out); /** * Returns true if no more work can be done on the query / out of results. */ virtual bool isEOF() = 0; // // Yielding and isolation semantics: // // Any data that is not inserted, deleted, or modified during a yield will be faithfully // returned by a query that should return that data. // // Any data inserted, deleted, or modified during a yield that should be returned by a query // may or may not be returned by that query. The query could return: nothing; the data // before; the data after; or both the data before and the data after. // // In short, there is no isolation between a query and an insert/delete/update. AKA, // READ_UNCOMMITTED. // /** * Notifies the stage that the underlying data source may change. * * It is illegal to call work() or isEOF() when a stage is in the "saved" state. * * Propagates to all children, then calls doSaveState(). */ void saveState(); /** * Notifies the stage that underlying data is stable again and prepares for calls to work(). * * Can only be called while the stage in is the "saved" state. * * Propagates to all children, then calls doRestoreState(). * * Throws a UserException on failure to restore due to a conflicting event such as a * collection drop. May throw a WriteConflictException, in which case the caller may choose to * retry. */ void restoreState(); /** * Detaches from the OperationContext and releases any storage-engine state. * * It is only legal to call this when in a "saved" state. While in the "detached" state, it is * only legal to call reattachToOperationContext or the destructor. It is not legal to call * detachFromOperationContext() while already in the detached state. * * Propagates to all children, then calls doDetachFromOperationContext(). */ void detachFromOperationContext(); /** * Reattaches to the OperationContext and reacquires any storage-engine state. * * It is only legal to call this in the "detached" state. On return, the cursor is left in a * "saved" state, so callers must still call restoreState to use this object. * * Propagates to all children, then calls doReattachToOperationContext(). */ void reattachToOperationContext(OperationContext* opCtx); /* * Releases any resources held by this stage. It is an error to use a PlanStage in any way after * calling dispose(). Does not throw exceptions. * * Propagates to all children, then calls doDispose(). */ void dispose(OperationContext* opCtx) { try { // We may or may not be attached during disposal. We can't call // reattachToOperationContext() // directly, since that will assert that '_opCtx' is not set. _opCtx = opCtx; invariant(!_opCtx || opCtx == opCtx); for (auto&& child : _children) { child->dispose(opCtx); } doDispose(); } catch (...) { std::terminate(); } } /** * Retrieve a list of this stage's children. This stage keeps ownership of * its children. */ const Children& getChildren() const { return _children; } /** * Returns the only child. * * Convenience method for PlanStages that have exactly one child. */ const std::unique_ptr& child() const { dassert(_children.size() == 1); return _children.front(); } /** * What type of stage is this? */ virtual StageType stageType() const = 0; // // Execution stats. // /** * Returns a tree of stats. See plan_stats.h for the details of this structure. If the * stage has any children it must propagate the request for stats to them. * * Creates plan stats tree which has the same topology as the original execution tree, * but has a separate lifetime. */ virtual std::unique_ptr getStats() = 0; /** * Get the CommonStats for this stage. The pointer is *not* owned by the caller. * * The returned pointer is only valid when the corresponding stage is also valid. * It must not exist past the stage. If you need the stats to outlive the stage, * use the getStats(...) method above. */ const CommonStats* getCommonStats() const { return &_commonStats; } /** * Get stats specific to this stage. Some stages may not have specific stats, in which * case they return NULL. The pointer is *not* owned by the caller. * * The returned pointer is only valid when the corresponding stage is also valid. * It must not exist past the stage. If you need the stats to outlive the stage, * use the getStats(...) method above. */ virtual const SpecificStats* getSpecificStats() const = 0; protected: /** * Performs one unit of work. See comment at work() above. */ virtual StageState doWork(WorkingSetID* out) = 0; /** * Saves any stage-specific state required to resume where it was if the underlying data * changes. * * Stages must be able to handle multiple calls to doSaveState() in a row without a call to * doRestoreState() in between. */ virtual void doSaveState() {} /** * Restores any stage-specific saved state and prepares to handle calls to work(). */ virtual void doRestoreState() {} /** * Does stage-specific detaching. * * Implementations of this method cannot use the pointer returned from getOpCtx(). */ virtual void doDetachFromOperationContext() {} /** * Does stage-specific attaching. * * If an OperationContext* is needed, use getOpCtx(), which will return a valid * OperationContext* (the one to which the stage is reattaching). */ virtual void doReattachToOperationContext() {} /** * Does stage-specific destruction. Must not throw exceptions. */ virtual void doDispose() {} ClockSource* getClock() const; OperationContext* getOpCtx() const { return _opCtx; } Children _children; CommonStats _commonStats; private: OperationContext* _opCtx; }; } // namespace mongo