summaryrefslogtreecommitdiff
path: root/src/mongo/db/exec/near.h
blob: 94d9639b3127b657e25ed38456d18311fd8408f7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
/**
 *    Copyright (C) 2018-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */

#pragma once

#include <memory>
#include <queue>
#include <vector>

#include "mongo/base/status_with.h"
#include "mongo/base/string_data.h"
#include "mongo/db/catalog/collection.h"
#include "mongo/db/exec/plan_stats.h"
#include "mongo/db/exec/requires_index_stage.h"
#include "mongo/db/exec/working_set.h"
#include "mongo/db/jsobj.h"
#include "mongo/db/record_id.h"
#include "mongo/stdx/unordered_map.h"

namespace mongo {

/**
 * An abstract stage which uses a progressive sort to return results sorted by distance.  This
 * is useful when we do not have a full ordering computed over the distance metric and don't
 * want to generate one.
 *
 * Some parts of the geoNear implementation depend on the type of index being used, so
 * subclasses need to implement these three functions:
 *
 * - initialize() - Prepares the stage to begin the geoNear search. Must return IS_EOF iff the
 *                  stage is prepared to begin buffering documents.
 * - nextInterval() - Must return the bounds of the next interval with a PlanStage that will find
 *                    all of the results in this interval that have not already been buffered in
 *                    previous intervals.
 * - computeDistance() - Must return the distance from a document to the centroid of search using
 *                       the correct metric (spherical/flat, radians/meters).
 *
 * For example - given a distance search over documents with distances from [0 -> 10], the child
 * stage might break up the search into intervals [0->5),[5,7),[7->10].
 *
 * Each interval requires a PlanStage which returns all of the results in the interval that have
 * not been buffered in a previous interval.  Results in each interval are buffered fully before
 * being returned to ensure that ordering is preserved. Results that are in the cover, but outside
 * the outer bounds of the current interval will remain buffered to be returned in subsequent
 * search intervals.
 *
 * For efficient search, the child stage should not return too many results outside the interval,
 * but correctness only depends on all the results in the interval being buffered before any are
 * returned. As an example, a PlanStage for the interval [0->5) might just be a full collection
 * scan - this will always buffer every result in the interval, but is slow.  If there is an index
 * available, an IndexScan stage might also return all documents with distance [0->5) but
 * would be much faster.
 *
 * Also for efficient search, the intervals should not be too large or too small - though again
 * correctness does not depend on interval size.
 *
 * The child stage may return duplicate documents, so it is the responsibility of NearStage to
 * deduplicate. Every document in _resultBuffer is kept track of in _seenDocuments. When a document
 * is returned, it is removed from _seenDocuments.
 *
 * TODO: Right now the interface allows the nextCovering() to be adaptive, but doesn't allow
 * aborting and shrinking a covered range being buffered if we guess wrong.
 */
class NearStage : public RequiresIndexStage {
public:
    struct CoveredInterval;

    ~NearStage();

    bool isEOF() final;
    StageState doWork(WorkingSetID* out) final;

    StageType stageType() const final;
    std::unique_ptr<PlanStageStats> getStats() final;
    const SpecificStats* getSpecificStats() const final;

protected:
    /**
     * Subclasses of NearStage must provide basics + a stats object which gets owned here.
     */
    NearStage(OperationContext* opCtx,
              const char* typeName,
              StageType type,
              WorkingSet* workingSet,
              const IndexDescriptor* indexDescriptor);

    //
    // Methods implemented for specific search functionality
    //

    /**
     * Constructs the next covering over the next interval to buffer results from, or NULL
     * if the full range has been searched.  Use the provided working set as the working
     * set for the covering stage if required.
     *
     * Returns !OK on failure to create next stage.
     */
    virtual StatusWith<CoveredInterval*> nextInterval(OperationContext* opCtx,
                                                      WorkingSet* workingSet,
                                                      const Collection* collection) = 0;

    /**
     * Computes the distance value for the given member data, or -1 if the member should not be
     * returned in the sorted results.
     *
     * Returns !OK on invalid member data.
     */
    virtual StatusWith<double> computeDistance(WorkingSetMember* member) = 0;

    /*
     * Initialize near stage before buffering the data.
     * Return IS_EOF if subclass finishes the initialization.
     * Return NEED_TIME if we need more time.
     * Return errors if an error occurs.
     * Can't return ADVANCED.
     */
    virtual StageState initialize(OperationContext* opCtx,
                                  WorkingSet* workingSet,
                                  WorkingSetID* out) = 0;

    void doSaveStateRequiresIndex() final {}

    void doRestoreStateRequiresIndex() final {}

    // Filled in by subclasses.
    NearStats _specificStats;

private:
    //
    // Generic methods for progressive search functionality
    //

    StageState initNext(WorkingSetID* out);
    StageState bufferNext(WorkingSetID* toReturn, Status* error);
    StageState advanceNext(WorkingSetID* toReturn);

    //
    // Generic state for progressive near search
    //

    WorkingSet* const _workingSet;

    // A progressive search works in stages of buffering and then advancing
    enum SearchState {
        SearchState_Initializing,
        SearchState_Buffering,
        SearchState_Advancing,
        SearchState_Finished
    } _searchState;

    // Tracks RecordIds from the child stage to do our own deduping.
    stdx::unordered_map<RecordId, WorkingSetID, RecordId::Hasher> _seenDocuments;

    // Stats for the stage covering this interval
    // This is owned by _specificStats
    IntervalStats* _nextIntervalStats;

    // Sorted buffered results to be returned - the current interval
    struct SearchResult;
    std::priority_queue<SearchResult> _resultBuffer;

    // Stats
    const StageType _stageType;

    // The current stage from which this stage should buffer results
    // Pointer to the last interval in _childrenIntervals. Owned by _childrenIntervals.
    CoveredInterval* _nextInterval;

    // All children CoveredIntervals and the sub-stages owned by them.
    //
    // All children intervals except the last active one are only used by getStats(),
    // because they are all EOF.
    std::vector<std::unique_ptr<CoveredInterval>> _childrenIntervals;
};

/**
 * A covered interval over which a portion of a near search can be run.
 */
struct NearStage::CoveredInterval {
    CoveredInterval(PlanStage* covering, double minDistance, double maxDistance, bool inclusiveMax);

    PlanStage* const covering;  // Owned in PlanStage::_children.

    const double minDistance;
    const double maxDistance;
    const bool inclusiveMax;
};

}  // namespace mongo