// src/mongo/s/client/shard_registry.h

/**
 *    Copyright (C) 2015 MongoDB Inc.
 *
 *    This program is free software: you can redistribute it and/or  modify
 *    it under the terms of the GNU Affero General Public License, version 3,
 *    as published by the Free Software Foundation.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU Affero General Public License for more details.
 *
 *    You should have received a copy of the GNU Affero General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the GNU Affero General Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */

#pragma once

#include <boost/optional.hpp>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "mongo/base/disallow_copying.h"
#include "mongo/db/jsobj.h"
#include "mongo/db/repl/optime.h"
#include "mongo/db/repl/read_concern_args.h"
#include "mongo/executor/task_executor_pool.h"
#include "mongo/platform/unordered_set.h"
#include "mongo/s/client/shard.h"
#include "mongo/stdx/mutex.h"

namespace mongo {

// Forward declarations, so that this header does not have to pull in the full definition of a
// type where a declaration is sufficient.

// Types declared with the 'class' key.
class BSONObjBuilder;
class CatalogManager;
class NamespaceString;
class OperationContext;
class RemoteCommandTargeterFactory;
class Shard;
class ShardType;

// Types declared with the 'struct' key.
struct HostAndPort;
struct ReadPreferenceSetting;

// Class templates.
template <typename T>
class StatusWith;

namespace executor {

class TaskExecutor;
class NetworkInterface;

}  // namespace executor

/**
 * Maintains the set of all shards known to the instance and their connections and exposes
 * functionality to run commands against shards. All commands which this registry executes are
 * retried on NotMaster class of errors and in addition all read commands are retried on network
 * errors automatically as well.
 */
class ShardRegistry {
    MONGO_DISALLOW_COPYING(ShardRegistry);

public:
    /**
     * Result of exhaustiveFindOnConfig().
     */
    struct QueryResponse {
        // Documents returned by the query.
        std::vector<BSONObj> docs;

        // OpTime accompanying the results.
        // NOTE(review): presumably the config server OpTime the read was performed at -- confirm
        // against the implementation.
        repl::OpTime opTime;
    };

    /**
     * Instantiates a new shard registry.
     *
     * @param targeterFactory Produces targeters for each shard's individual connection string
     * @param executorPool Pool of asynchronous task executors to use for making calls to shards
     *     and config servers.
     * @param network Network interface backing the executor pool.
     * @param addShardExecutor Asynchronous task executor to use for making calls to nodes that
     *     are not yet in the ShardRegistry
     * @param configServerCS ConnectionString used for communicating with the config servers
     */
    ShardRegistry(std::unique_ptr<RemoteCommandTargeterFactory> targeterFactory,
                  std::unique_ptr<executor::TaskExecutorPool> executorPool,
                  executor::NetworkInterface* network,
                  std::unique_ptr<executor::TaskExecutor> addShardExecutor,
                  ConnectionString configServerCS);

    ~ShardRegistry();

    /**
     * Invoked when the connection string for the config server changes. Updates the config server
     * connection string and recreates the config server's shard.
     */
    void updateConfigServerConnectionString(ConnectionString configServerCS);

    /**
     * Invokes the executor's startup method, which will start any networking/async execution
     * threads.
     */
    void startup();

    /**
     * Stops the executor thread and waits for it to join.
     */
    void shutdown();

    /**
     * Returns the fixed executor of the executor pool. The pool, and therefore the returned
     * executor, remains owned by the registry.
     */
    executor::TaskExecutor* getExecutor() const {
        return _executorPool->getFixedExecutor();
    }

    /**
     * Returns the executor pool itself. Ownership stays with the registry.
     */
    executor::TaskExecutorPool* getExecutorPool() const {
        return _executorPool.get();
    }

    /**
     * Returns the network interface used by the executor pool.
     */
    executor::NetworkInterface* getNetwork() const {
        return _network;
    }

    /**
     * Returns a copy of the current config server connection string.
     *
     * The connection string can be replaced by updateConfigServerConnectionString(), so the copy
     * is taken while holding _mutex. Callers must not already hold _mutex.
     */
    ConnectionString getConfigServerConnectionString() const {
        stdx::lock_guard<stdx::mutex> lk(_mutex);
        return _configServerCS;
    }

    /**
     * Reloads the ShardRegistry based on the contents of the config server's config.shards
     * collection.
     */
    void reload(OperationContext* txn);

    /**
     * Throws out and reconstructs the config shard.  This has the effect that if replica set
     * monitoring of the config server replica set has stopped (because the set was down for too
     * long), this will cause the ReplicaSetMonitor to be rebuilt, which will re-trigger monitoring
     * of the config replica set to resume.
     */
    void rebuildConfigShard();

    /**
     * Updates _lookup and _rsLookup based on the given new version of the given Shard's
     * ConnectionString. If ConnectionString is not specified the maps are updated with hosts
     * from the ReplicaSetMonitor.
     * Used to update the ShardRegistry when a change in replica set membership is detected by the
     * ReplicaSetMonitor.
     */
    void updateLookupMapsForShard(std::shared_ptr<Shard> shard,
                                  boost::optional<const ConnectionString&> newConnString);

    /**
     * Returns a shared pointer to the shard object with the given shard id.
     * May refresh the shard registry if there's no cached information about the shard. The shardId
     * parameter can actually be the shard name or the HostAndPort for any
     * server in the shard.
     */
    std::shared_ptr<Shard> getShard(OperationContext* txn, const ShardId& shardId);

    /**
     * Returns a shared pointer to the shard object with the given shard id. The shardId parameter
     * can actually be the shard name or the HostAndPort for any server in the shard. Will not
     * refresh the shard registry or otherwise perform any network traffic. This means that if the
     * shard was recently added it may not be found.  USE WITH CAUTION.
     */
    std::shared_ptr<Shard> getShardNoReload(const ShardId& shardId);

    /**
     * Finds the Shard that the mongod listening at this HostAndPort is a member of. Will not
     * refresh the shard registry or otherwise perform any network traffic.
     */
    std::shared_ptr<Shard> getShardForHostNoReload(const HostAndPort& shardHost);

    /**
     * Returns shared pointer to the shard object representing the config servers.
     */
    std::shared_ptr<Shard> getConfigShard();

    /**
     * Instantiates a new detached shard connection, which does not appear in the list of shards
     * tracked by the registry and as a result will not be returned by getAllShardIds.
     *
     * The caller owns the returned shard object and is responsible for disposing of it when done.
     *
     * @param connStr Connection string to the shard.
     */
    std::unique_ptr<Shard> createConnection(const ConnectionString& connStr) const;

    /**
     * Lookup shard by replica set name. Returns nullptr if the name can't be found.
     * Note: this doesn't refresh the table if the name isn't found, so it's possible that a
     * newly added shard/Replica Set may not be found.
     */
    std::shared_ptr<Shard> lookupRSName(const std::string& name) const;

    /**
     * NOTE(review): undocumented in the original. Presumably drops the shard with the given id
     * from the registry's lookup maps -- confirm against the implementation.
     */
    void remove(const ShardId& id);

    /**
     * Outputs shard ids through 'all'.
     * NOTE(review): presumably the ids of every shard currently tracked by the registry, and
     * whether 'all' is cleared first is not visible here -- confirm against the implementation.
     */
    void getAllShardIds(std::vector<ShardId>* all) const;

    /**
     * Writes into 'result'.
     * NOTE(review): presumably a BSON description of the registry's contents -- confirm the exact
     * format against the implementation.
     */
    void toBSON(BSONObjBuilder* result);

    /**
     * If the newly specified optime is newer than the one the ShardRegistry already knows, the
     * one in the registry will be advanced. Otherwise, it remains the same.
     */
    void advanceConfigOpTime(repl::OpTime opTime);

    /**
     * Returns the last known OpTime of the config servers.
     */
    repl::OpTime getConfigOpTime();

    /**
     * Executes 'find' command against a config server matching the given read preference, and
     * fetches *all* the results that the host will return until there are no more or until an error
     * is returned.
     *
     * Returns either the complete set of results or an error, never partial results.
     *
     * Note: should never be used outside of CatalogManagerReplicaSet or DistLockCatalogImpl.
     */
    StatusWith<QueryResponse> exhaustiveFindOnConfig(OperationContext* txn,
                                                     const ReadPreferenceSetting& readPref,
                                                     const NamespaceString& nss,
                                                     const BSONObj& query,
                                                     const BSONObj& sort,
                                                     boost::optional<long long> limit);

    /**
     * Runs a command against a host belonging to the specified shard and matching the given
     * readPref, and returns the result.  It is the responsibility of the caller to check the
     * returned BSON for command-specific failures.
     *
     * Two overloads are provided: one takes the Shard object itself, the other identifies the
     * shard by its ShardId.
     */
    StatusWith<BSONObj> runCommandOnShard(OperationContext* txn,
                                          const std::shared_ptr<Shard>& shard,
                                          const ReadPreferenceSetting& readPref,
                                          const std::string& dbName,
                                          const BSONObj& cmdObj);
    StatusWith<BSONObj> runCommandOnShard(OperationContext* txn,
                                          ShardId shardId,
                                          const ReadPreferenceSetting& readPref,
                                          const std::string& dbName,
                                          const BSONObj& cmdObj);


    /**
     * Same as runCommandOnShard above but used for talking to nodes that are not yet in the
     * ShardRegistry.
     */
    StatusWith<BSONObj> runCommandForAddShard(OperationContext* txn,
                                              const std::shared_ptr<Shard>& shard,
                                              const ReadPreferenceSetting& readPref,
                                              const std::string& dbName,
                                              const BSONObj& cmdObj);

    /**
     * Runs a command against a config server that matches the given read preference, and returns
     * the result.  It is the responsibility of the caller to check the returned BSON for
     * command-specific failures.
     */
    StatusWith<BSONObj> runCommandOnConfig(OperationContext* txn,
                                           const ReadPreferenceSetting& readPref,
                                           const std::string& dbname,
                                           const BSONObj& cmdObj);

    /**
     * Helpers for running commands against a given shard with logic for retargeting and
     * retrying the command in the event of a NotMaster response.
     * Returns ErrorCodes::NotMaster if after the max number of retries we still haven't
     * successfully delivered the command to a primary.  Can also return a non-ok status in the
     * event of a network error communicating with the shard.  If we are able to get
     * a valid response from running the command then we will return it, even if the command
     * response indicates failure.  Thus the caller is responsible for checking the command
     * response object for any kind of command-specific failure.  The only exception is
     * NotMaster errors, which we intercept and follow the rules described above for handling.
     */
    StatusWith<BSONObj> runCommandWithNotMasterRetries(OperationContext* txn,
                                                       const ShardId& shard,
                                                       const std::string& dbname,
                                                       const BSONObj& cmdObj);

    /**
     * Hash functor for ErrorCodes::Error, so that error codes can be stored in a hashed set.
     * Hashes the code's underlying integer value with std::hash.
     */
    class ErrorCodesHash {
    public:
        size_t operator()(ErrorCodes::Error e) const {
            // Convert explicitly rather than relying on an implicit enum-to-integer conversion.
            using Underlying = std::underlying_type<ErrorCodes::Error>::type;
            return std::hash<Underlying>()(static_cast<Underlying>(e));
        }
    };

    // Hashed set of error codes, used to describe which errors a command should be retried on.
    using ErrorCodesSet = unordered_set<ErrorCodes::Error, ErrorCodesHash>;

    /**
     * Runs commands against the config shard's primary. Retries if executing the command fails with
     * one of the given error codes, or if executing the command succeeds but the server returned
     * one of the codes. If executing the command fails with a different code we return that code.
     * If executing the command succeeds and the command itself succeeds or fails with a code not in
     * the set, then we return the command response object. Thus the caller is responsible for
     * checking the command response object for any kind of command-specific failures other than
     * those specified in errorsToCheck.
     */
    StatusWith<BSONObj> runCommandOnConfigWithRetries(OperationContext* txn,
                                                      const std::string& dbname,
                                                      const BSONObj& cmdObj,
                                                      const ErrorCodesSet& errorsToCheck);

    /**
     * Notifies the specified RemoteCommandTargeter of a particular mode of failure for the
     * specified host.
     */
    static void updateReplSetMonitor(const std::shared_ptr<RemoteCommandTargeter>& targeter,
                                     const HostAndPort& remoteHost,
                                     const Status& remoteCommandStatus);

    /**
     * Set of error codes, which indicate that the remote host is not the current master. Retries on
     * errors from this set are always safe and should be used by default.
     */
    static const ErrorCodesSet kNotMasterErrors;

    /**
     * Set of error codes which includes NotMaster and all other exceptions on which it is okay to
     * retry the operation, but the retries may require some additional idempotency guarantees
     * imposed by the calling code.
     */
    static const ErrorCodesSet kAllRetriableErrors;

private:
    // Lookup table type keyed by ShardId; the mapped Shard objects are shared with callers.
    using ShardMap = std::unordered_map<ShardId, std::shared_ptr<Shard>>;

    /**
     * Internal result of running a command: the command reply, the reply metadata and an OpTime.
     * NOTE(review): visibleOpTime is presumably extracted from the reply metadata -- confirm
     * against the implementation.
     */
    struct CommandResponse {
        BSONObj response;
        BSONObj metadata;
        repl::OpTime visibleOpTime;
    };

    /**
     * Creates a shard based on the specified information and puts it into the lookup maps.
     * The "_inlock" suffix indicates that the caller is expected to hold _mutex.
     */
    void _addShard_inlock(const ShardType& shardType, bool passHostName);

    /**
     * Adds the "config" shard (representing the config server) to the shard registry.
     * The "_inlock" suffix indicates that the caller is expected to hold _mutex.
     */
    void _addConfigShard_inlock();

    /**
     * Variant of updateLookupMapsForShard() with the same parameters. The "_inlock" suffix
     * indicates that the caller is expected to hold _mutex.
     */
    void _updateLookupMapsForShard_inlock(std::shared_ptr<Shard> shard,
                                          boost::optional<const ConnectionString&> newConnString);

    /**
     * Looks up a shard by id.
     * NOTE(review): presumably consults only the cached lookup maps and returns nullptr when the
     * id is unknown -- confirm against the implementation.
     */
    std::shared_ptr<Shard> _findUsingLookUp(const ShardId& shardId);

    /**
     * Runs a command against the specified host, checks the returned reply (if any) for
     * errorsToCheck and returns the result. If the command succeeds, it is the responsibility
     * of the caller to check the returned BSON for command-specific failures.
     */
    StatusWith<CommandResponse> _runCommandWithMetadata(OperationContext* txn,
                                                        executor::TaskExecutor* executor,
                                                        const std::shared_ptr<Shard>& shard,
                                                        const ReadPreferenceSetting& readPref,
                                                        const std::string& dbName,
                                                        const BSONObj& cmdObj,
                                                        const BSONObj& metadata,
                                                        const ErrorCodesSet& errorsToCheck);

    /**
     * Helper taking the same parameters as exhaustiveFindOnConfig().
     * NOTE(review): presumably performs a single attempt of the find, with any retrying done by
     * the public method -- confirm against the implementation.
     */
    StatusWith<QueryResponse> _exhaustiveFindOnConfig(OperationContext* txn,
                                                      const ReadPreferenceSetting& readPref,
                                                      const NamespaceString& nss,
                                                      const BSONObj& query,
                                                      const BSONObj& sort,
                                                      boost::optional<long long> limit);


    /**
     * Runs the command cmdObj and extracts an error code from its result. If that code is in the
     * errorsToCheck set the command is retried, until the max number of retries is reached.
     */
    StatusWith<CommandResponse> _runCommandWithRetries(OperationContext* txn,
                                                       executor::TaskExecutor* executor,
                                                       const std::shared_ptr<Shard>& shard,
                                                       const ReadPreferenceSetting& readPref,
                                                       const std::string& dbname,
                                                       const BSONObj& cmdObj,
                                                       const BSONObj& metadata,
                                                       const ErrorCodesSet& errorsToCheck);

    // Factory to obtain remote command targeters for shards
    const std::unique_ptr<RemoteCommandTargeterFactory> _targeterFactory;

    // Executor pool for scheduling work and remote commands to shards and config servers. Each
    // contained executor has a connection hook set on it for initializing sharding data on shards
    // and detecting if the catalog manager needs swapping.
    const std::unique_ptr<executor::TaskExecutorPool> _executorPool;

    // Network interface being used by _executorPool.  Used for asking questions about the network
    // configuration, such as getting the current server's hostname. Held as a raw pointer.
    executor::NetworkInterface* const _network;

    // Executor specifically used for sending commands to servers that are in the process of being
    // added as shards.  Does not have any connection hook set on it.
    const std::unique_ptr<executor::TaskExecutor> _executorForAddShard;

    // Protects the config server connection string, _configOpTime, and the lookup maps below.
    // Declared mutable so that const member functions can lock it.
    mutable stdx::mutex _mutex;

    // Config server connection string
    ConnectionString _configServerCS;

    // Last known highest opTime from the config server that should be used when doing reads.
    repl::OpTime _configOpTime;

    // Config server OpTime of the query run during the last successful ShardRegistry::reload() call
    // NOTE(review): not listed among the members _mutex protects -- confirm how access to it is
    // synchronized.
    repl::OpTime _lastReloadOpTime;

    // Map of both shardName -> Shard and hostName -> Shard
    ShardMap _lookup;

    // Map from replica set name to shard corresponding to this replica set
    ShardMap _rsLookup;

    // Map from HostAndPort to Shard.
    // NOTE(review): presumably the table behind getShardForHostNoReload() -- confirm against the
    // implementation.
    std::unordered_map<HostAndPort, std::shared_ptr<Shard>> _hostLookup;
};

}  // namespace mongo