summaryrefslogtreecommitdiff
path: root/src/mongo/db/exec/plan_stage.h
blob: c1a232b3b2c2df697339bb01220bdb9348cf89c8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/**
 *    Copyright (C) 2013 10gen Inc.
 *
 *    This program is free software: you can redistribute it and/or  modify
 *    it under the terms of the GNU Affero General Public License, version 3,
 *    as published by the Free Software Foundation.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU Affero General Public License for more details.
 *
 *    You should have received a copy of the GNU Affero General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "mongo/db/exec/working_set.h"

namespace mongo {

    class DiskLoc;

    /**
     * A PlanStage ("stage") is the basic building block of a "Query Execution Plan."  A stage is
     * the smallest piece of machinery used in executing a compiled query.  Stages either access
     * data (from a collection or an index) to create a stream of results, or transform a stream of
     * results (e.g. AND, OR, SORT) to create a stream of results.  
     *
     * Stages have zero or more input streams but only one output stream.  Data-accessing stages are
     * leaves and data-transforming stages have children.  Stages can be connected together to form
     * a tree which is then executed (see plan_runner.h) to solve a query.  
     *
     * A stage's input and output are each typed.  Only stages with compatible types can be
     * connected.
     *
     * All of the stages of a QEP share a WorkingSet (see working_set.h).  Data source stages
     * allocate a slot in the WorkingSet, fill the slot with data, and return the ID of that slot.
     * Subsequent stages fetch a WorkingSetElement by its ID and operate on the enclosed data.
     *
     * Stages do nothing unless work() is called.  work() is a request to the stage to consume one
     * unit of input.  Some stages (e.g. AND, SORT) require many calls to work() before generating
     * output as they must consume many units of input.  These stages will inform the caller that
     * they need more time, and work() must be called again in order to produce an output.
     *
     * Every stage of a query implements the PlanStage interface.  Queries perform a unit of work
     * and report on their subsequent status; see StatusCode for possible states.  Query results are
     * passed through the WorkingSet interface; see working_set.h for details.
     *
     * All synchronization is the responsibility of the caller.  Queries must be told to yield with
     * prepareToYield() if any underlying database state changes.  If prepareToYield() is called,
     * recoverFromYield() must be called again before any work() is done.
     *
     * Here is a very simple usage example:
     *
     * WorkingSet workingSet;
     * PlanStage* rootStage = makeQueryPlan(&workingSet, ...);
     * while (!rootStage->isEOF()) {
     *     WorkingSetID result;
     *     switch(rootStage->work(&result)) {
     *     case PlanStage::ADVANCED:
     *         // do something with result
     *         WorkingSetMember* member = workingSet.get(result);
     *         cout << "Result: " << member->obj << endl;
     *         break;
     *     case PlanStage::IS_EOF:
     *         // All done.  Will fall out of while loop.
     *         break;
     *     case PlanStage::NEED_TIME:
     *         // Need more time.
     *         break;
     *     case PlanStage::FAILURE:
     *         // Throw exception or return error
     *         break;
     *     case PlanStage::NEED_FETCH:
     *         // Go to disk and fetch stuff.
     *         break;
     *     }
     *
     *     if (shouldYield) {
     *         // Occasionally yield.
     *         stage->prepareToYield();
     *         // Do work that requires a yield here (execute other plans, insert, delete, etc.).
     *         stage->recoverFromYield();
     *     }
     * }
     */
    class PlanStage {
    public:
        virtual ~PlanStage() { }

        /**
         * All possible return values of work(...)
         */
        enum StageState {
            // work(...) has returned a new result in its out parameter.
            ADVANCED,
            // work(...) won't do anything more.  isEOF() will also be true.
            IS_EOF,
            // work(...) needs more time to product a result.  Call work(...) again.
            NEED_TIME,
            // Something has gone unrecoverably wrong.  Stop running this query.
            FAILURE,
            // Something isn't in memory.  Fetch it.  TODO: actually support this (forthcoming).
            NEED_YIELD,
        };

        /**
         * Perform a unit of work on the query.  Ask the stage to produce the next unit of output.
         * Stage returns StageState::ADVANCED if *out is set to the next unit of output.  Otherwise,
         * returns another value of StageState to indicate the stage's status.
         */
        virtual StageState work(WorkingSetID* out) = 0;

        /**
         * Returns true if no more work can be done on the query / out of results.
         */
        virtual bool isEOF() = 0;

        //
        // Yielding and isolation semantics:
        //
        // Any data that is not inserted, deleted, or modified during a yield will be faithfully
        // returned by a query that should return that data.
        //
        // Any data inserted, deleted, or modified during a yield that should be returned by a query
        // may or may not be returned by that query.  The query could return: nothing; the data
        // before; the data after; or both the data before and the data after.
        //
        // In short, there is no isolation between a query and an insert/delete/update.  AKA,
        // READ_UNCOMMITTED.
        //

        /**
         * Notifies the stage that all locks are about to be released.  The stage must save any
         * state required to resume where it was before prepareToYield was called.
         */
        virtual void prepareToYield() = 0;

        /**
         * Notifies the stage that any required locks have been reacquired.  The stage must restore
         * any saved state and be ready to handle calls to work().
         *
         * Can only be called after prepareToYield.
         */
        virtual void recoverFromYield() = 0;

        /**
         * Notifies a stage that a DiskLoc is going to be deleted (or in-place updated) so that the
         * stage can invalidate or modify any state required to continue processing without this
         * DiskLoc.
         *
         * Can only be called after a prepareToYield but before a recoverFromYield.
         */
        virtual void invalidate(const DiskLoc& dl) = 0;
    };

}  // namespace mongo