/**
* Copyright (C) 2013-2014 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kQuery
#include "mongo/platform/basic.h"
#include "mongo/db/query/find.h"
#include "mongo/client/dbclientinterface.h"
#include "mongo/db/catalog/collection.h"
#include "mongo/db/catalog/database_holder.h"
#include "mongo/db/client.h"
#include "mongo/db/clientcursor.h"
#include "mongo/db/commands.h"
#include "mongo/db/curop.h"
#include "mongo/db/db_raii.h"
#include "mongo/db/exec/filter.h"
#include "mongo/db/exec/working_set_common.h"
#include "mongo/db/service_context.h"
#include "mongo/db/keypattern.h"
#include "mongo/db/query/explain.h"
#include "mongo/db/query/find_common.h"
#include "mongo/db/query/get_executor.h"
#include "mongo/db/query/internal_plans.h"
#include "mongo/db/query/query_planner_params.h"
#include "mongo/db/repl/replication_coordinator_global.h"
#include "mongo/db/s/sharding_state.h"
#include "mongo/db/server_options.h"
#include "mongo/db/server_parameters.h"
#include "mongo/db/storage_options.h"
#include "mongo/s/chunk_version.h"
#include "mongo/s/stale_exception.h"
#include "mongo/stdx/memory.h"
#include "mongo/util/fail_point_service.h"
#include "mongo/util/log.h"
#include "mongo/util/mongoutils/str.h"
namespace mongo {
using std::endl;
using std::unique_ptr;
using stdx::make_unique;
// Failpoint for checking whether we've received a getmore.
MONGO_FP_DECLARE(failReceivedGetmore);
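
// Returns true if 'cursor' was created with the QueryOption_CursorTailable option set.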
bool isCursorTailable(const ClientCursor* cursor) {
return cursor->queryOptions() & QueryOption_CursorTailable;
}
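
// Returns true if 'cursor' was created with the QueryOption_AwaitData option set.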
bool isCursorAwaitData(const ClientCursor* cursor) {
return cursor->queryOptions() & QueryOption_AwaitData;
}
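
// Returns true if the initial find should leave its ClientCursor open so that the client can
// fetch more results via getMore. The cursor is not saved if the executor failed or was killed,
// if the query asked for at most one batch (wantMore false on a non-tailable query, or
// ntoreturn == 1), or if the executor has already reached EOF; a tailable cursor is kept only
// when the tailed collection has at least one record.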
bool shouldSaveCursor(OperationContext* txn,
const Collection* collection,
PlanExecutor::ExecState finalState,
PlanExecutor* exec) {
if (PlanExecutor::FAILURE == finalState || PlanExecutor::DEAD == finalState) {
return false;
}
const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();
if (!pq.wantMore() && !pq.isTailable()) {
return false;
}
if (pq.getNToReturn().value_or(0) == 1) {
return false;
}
// We keep a tailable cursor around unless the collection we're tailing has no
// records.
//
// SERVER-13955: we should be able to create a tailable cursor that waits on
// an empty collection. Right now we do not keep a cursor if the collection
// has zero records.
if (pq.isTailable()) {
return collection && collection->numRecords(txn) != 0U;
}
return !exec->isEOF();
}
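
// getMore counterpart of shouldSaveCursor(): keep the ClientCursor open unless the executor
// failed or was killed, or it reached EOF on a non-tailable cursor.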
bool shouldSaveCursorGetMore(PlanExecutor::ExecState finalState,
PlanExecutor* exec,
bool isTailable) {
if (PlanExecutor::FAILURE == finalState || PlanExecutor::DEAD == finalState) {
return false;
}
if (isTailable) {
return true;
}
return !exec->isEOF();
}
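
// Sets up CurOp at the start of a query operation: records the query object, ntoreturn and
// ntoskip, and the namespace, so that currentOp, the profiler, and the slow query log can
// report them.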
void beginQueryOp(OperationContext* txn,
const NamespaceString& nss,
const BSONObj& queryObj,
long long ntoreturn,
long long ntoskip) {
auto curop = CurOp::get(txn);
curop->debug().query = queryObj;
curop->debug().ntoreturn = ntoreturn;
curop->debug().ntoskip = ntoskip;
stdx::lock_guard<Client> lk(*txn->getClient());
curop->setQuery_inlock(queryObj);
curop->setNS_inlock(nss.ns());
}
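
// Finishes CurOp bookkeeping for a completed query: records result counts and summary
// execution stats, and, when profiling or verbose query logging is enabled, the plan summary
// and full execution stats.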
void endQueryOp(OperationContext* txn,
Collection* collection,
const PlanExecutor& exec,
int dbProfilingLevel,
long long numResults,
CursorId cursorId) {
auto curop = CurOp::get(txn);
// Fill out basic curop query exec properties.
curop->debug().nreturned = numResults;
curop->debug().cursorid = (0 == cursorId ? -1 : cursorId);
curop->debug().cursorExhausted = (0 == cursorId);
// Fill out curop based on explain summary statistics.
PlanSummaryStats summaryStats;
Explain::getSummaryStats(exec, &summaryStats);
curop->debug().hasSortStage = summaryStats.hasSortStage;
curop->debug().keysExamined = summaryStats.totalKeysExamined;
curop->debug().docsExamined = summaryStats.totalDocsExamined;
curop->debug().idhack = summaryStats.isIdhack;
if (collection) {
collection->infoCache()->notifyOfQuery(txn, summaryStats.indexesUsed);
}
const logger::LogComponent queryLogComponent = logger::LogComponent::kQuery;
const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1);
// Set debug information for consumption by the profiler and slow query log.
if (dbProfilingLevel > 0 || curop->elapsedMillis() > serverGlobalParams.slowMS ||
logger::globalLogDomain()->shouldLog(queryLogComponent, logLevelOne)) {
// Generate plan summary string.
curop->debug().planSummary = Explain::getPlanSummary(&exec);
}
// Set debug information for consumption by the profiler only.
if (dbProfilingLevel > 0) {
// Get BSON stats.
unique_ptr<PlanStageStats> execStats(exec.getStats());
BSONObjBuilder statsBob;
Explain::statsToBSON(*execStats, &statsBob);
curop->debug().execStats.set(statsBob.obj());
// Replace exec stats with plan summary if stats cannot fit into CachedBSONObj.
if (curop->debug().execStats.tooBig() && !curop->debug().planSummary.empty()) {
BSONObjBuilder bob;
bob.append("summary", curop->debug().planSummary.toString());
curop->debug().execStats.set(bob.done());
}
}
}
namespace {
/**
* Uses 'cursor' to fill out 'bb' with the batch of result documents to
* be returned by this getMore.
*
* Returns the number of documents in the batch in 'numResults', which must be initialized to
* zero by the caller. Returns the final ExecState returned by the cursor in *state. Returns
 * the slave's time to read until in 'slaveReadTill' (for master/slave replication).
 *
 * Throws an assertion (via uasserted) if the PlanExecutor encounters a failure while
 * generating the batch.
*/
void generateBatch(int ntoreturn,
ClientCursor* cursor,
BufBuilder* bb,
int* numResults,
Timestamp* slaveReadTill,
PlanExecutor::ExecState* state) {
PlanExecutor* exec = cursor->getExecutor();
BSONObj obj;
while (PlanExecutor::ADVANCED == (*state = exec->getNext(&obj, NULL))) {
// Add result to output buffer.
bb->appendBuf((void*)obj.objdata(), obj.objsize());
// Count the result.
(*numResults)++;
// Possibly note slave's position in the oplog.
if (cursor->queryOptions() & QueryOption_OplogReplay) {
BSONElement e = obj["ts"];
if (BSONType::Date == e.type() || BSONType::bsonTimestamp == e.type()) {
*slaveReadTill = e.timestamp();
}
}
if (FindCommon::enoughForGetMore(ntoreturn, *numResults, bb->len())) {
break;
}
}
if (PlanExecutor::DEAD == *state || PlanExecutor::FAILURE == *state) {
// Propagate this error to caller.
const unique_ptr<PlanStageStats> stats(exec->getStats());
error() << "getMore executor error, stats: " << Explain::statsToBSON(*stats);
uasserted(17406, "getMore executor error: " + WorkingSetCommon::toStatusString(obj));
}
}
} // namespace
/**
* Called by db/instance.cpp. This is the getMore entry point.
*/
QueryResult::View getMore(OperationContext* txn,
const char* ns,
int ntoreturn,
long long cursorid,
bool* exhaust,
bool* isCursorAuthorized) {
CurOp& curop = *CurOp::get(txn);
// For testing, we may want to fail if we receive a getmore.
if (MONGO_FAIL_POINT(failReceivedGetmore)) {
invariant(0);
}
*exhaust = false;
const NamespaceString nss(ns);
// Depending on the type of cursor being operated on, we hold locks for the whole getMore,
// or none of the getMore, or part of the getMore. The three cases in detail:
//
// 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
// 2) Cursor owned by global cursor manager: we don't lock anything. These cursors don't own
// any collection state. These cursors are generated either by the listCollections or
// listIndexes commands, as these special cursor-generating commands operate over catalog
// data rather than targeting the data within a collection.
// 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
// "unpinCollLock". This is because agg cursors handle locking internally (hence the
// release), but the pin and unpin of the cursor must occur under the collection lock.
// We don't use our AutoGetCollectionForRead "ctx" to relock, because
// AutoGetCollectionForRead checks the sharding version (and we want the relock for the
// unpin to succeed even if the sharding version has changed).
//
// Note that we declare our locks before our ClientCursorPin, in order to ensure that the
// pin's destructor is called before the lock destructors (so that the unpin occurs under
// the lock).
unique_ptr<AutoGetCollectionForRead> ctx;
unique_ptr<Lock::DBLock> unpinDBLock;
unique_ptr<Lock::CollectionLock> unpinCollLock;
CursorManager* cursorManager;
if (nss.isListIndexesCursorNS() || nss.isListCollectionsCursorNS()) {
// List collections and list indexes are special cursor-generating commands whose
// cursors are managed globally, as they operate over catalog data rather than targeting
// the data within a collection.
cursorManager = CursorManager::getGlobalCursorManager();
} else {
ctx = stdx::make_unique<AutoGetCollectionForRead>(txn, nss);
Collection* collection = ctx->getCollection();
uassert(17356, "collection dropped between getMore calls", collection);
cursorManager = collection->getCursorManager();
}
LOG(5) << "Running getMore, cursorid: " << cursorid << endl;
// This checks to make sure the operation is allowed on a replicated node. Since we are not
// passing in a query object (necessary to check the SlaveOk query option), we pass true for
// the slaveOk argument, allowing the read whether this node is a primary or a secondary.
// This function uasserts if reads are not okay.
Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(txn, nss, true);
uassertStatusOK(status);
// A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
// doesn't time out. Also informs ClientCursor that there is somebody actively holding the
// CC, so don't delete it.
ClientCursorPin ccPin(cursorManager, cursorid);
ClientCursor* cc = ccPin.c();
// These are set in the QueryResult msg we return.
int resultFlags = ResultFlag_AwaitCapable;
int numResults = 0;
int startingResult = 0;
const int InitialBufSize =
512 + sizeof(QueryResult::Value) + FindCommon::kMaxBytesToReturnToClientAtOnce;
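// Reserve space for the batch up front and skip past the bytes where the QueryResult header
// will be written once the batch is complete (see the bottom of this function).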
BufBuilder bb(InitialBufSize);
bb.skip(sizeof(QueryResult::Value));
if (NULL == cc) {
cursorid = 0;
resultFlags = ResultFlag_CursorNotFound;
} else {
// Check that the namespace of the request matches the namespace the cursor was created on,
// so a client cannot spoof the ns to operate on another cursor.
uassert(ErrorCodes::Unauthorized,
str::stream() << "Requested getMore on namespace " << ns << ", but cursor "
<< cursorid << " belongs to namespace " << cc->ns(),
ns == cc->ns());
*isCursorAuthorized = true;
if (cc->isReadCommitted())
uassertStatusOK(txn->recoveryUnit()->setReadFromMajorityCommittedSnapshot());
// Reset timeout timer on the cursor since the cursor is still in use.
cc->setIdleTime(0);
// If the operation that spawned this cursor had a time limit set, apply leftover
// time to this getmore.
curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.
// Ensure that the original query or command object is available in the slow query log,
// profiler, and currentOp.
curop.debug().query = cc->getQuery();
{
stdx::lock_guard<Client> lk(*txn->getClient());
curop.setQuery_inlock(cc->getQuery());
}
cc->updateSlaveLocation(txn);
if (cc->isAggCursor()) {
// Agg cursors handle their own locking internally.
ctx.reset(); // unlocks
}
// If we're replaying the oplog, we save the last time that we read.
Timestamp slaveReadTill;
// What number result are we starting at? Used to fill out the reply.
startingResult = cc->pos();
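// The executor was detached from its OperationContext and its state was saved when the
// cursor was last cached (see the corresponding saveState()/detachFromOperationContext()
// calls below and in runQuery()), so reattach and restore it before use.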
PlanExecutor* exec = cc->getExecutor();
exec->reattachToOperationContext(txn);
exec->restoreState();
PlanExecutor::ExecState state;
generateBatch(ntoreturn, cc, &bb, &numResults, &slaveReadTill, &state);
// If this is an await data cursor, and we hit EOF without generating any results, then
// we block waiting for new data to arrive.
if (isCursorAwaitData(cc) && state == PlanExecutor::IS_EOF && numResults == 0) {
// Retrieve the notifier which we will wait on until new data arrives. We make sure
// to do this while still holding the lock, because once the lock is dropped the
// collection may become invalid. The notifier itself will outlive the collection if
// the collection is dropped, as we keep a shared_ptr to it.
auto notifier = ctx->getCollection()->getCappedInsertNotifier();
// Save the PlanExecutor and drop our locks.
exec->saveState();
ctx.reset();
// Block waiting for data for up to 1 second.
Seconds timeout(1);
notifier->wait(timeout);
notifier.reset();
// Set expected latency to match wait time. This makes sure the logs aren't spammed
// by awaitData queries that exceed slowms due to blocking on the CappedInsertNotifier.
curop.setExpectedLatencyMs(durationCount<Milliseconds>(timeout));
// Reacquiring locks.
ctx = make_unique<AutoGetCollectionForRead>(txn, nss);
exec->restoreState();
// We woke up because either the timed_wait expired, or there was more data. Either
// way, attempt to generate another batch of results.
generateBatch(ntoreturn, cc, &bb, &numResults, &slaveReadTill, &state);
}
// We have to do this before re-acquiring locks in the agg case because
// shouldSaveCursorGetMore() can make a network call for agg cursors.
//
// TODO: Getting rid of PlanExecutor::isEOF() in favor of PlanExecutor::IS_EOF would mean
// that this network operation is no longer necessary.
const bool shouldSaveCursor = shouldSaveCursorGetMore(state, exec, isCursorTailable(cc));
// Deregistering a cursor requires holding the DB and collection locks. Agg cursors released
// these locks above, so we must reacquire them here before the cursor pin is cleaned up.
if (cc->isAggCursor()) {
invariant(NULL == ctx.get());
unpinDBLock = make_unique<Lock::DBLock>(txn->lockState(), nss.db(), MODE_IS);
unpinCollLock = make_unique<Lock::CollectionLock>(txn->lockState(), nss.ns(), MODE_IS);
}
// Our two possible ClientCursorPin cleanup paths are:
// 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin.
// 2) If the cursor is going to be saved, we simply let the pin go out of scope. In
// this case, the pin's destructor will be invoked, which will call release() on the
// pin. Because our ClientCursorPin is declared after our lock is declared, this
// will happen under the lock.
if (!shouldSaveCursor) {
ccPin.deleteUnderlying();
// cc is now invalid, as is the executor
cursorid = 0;
cc = NULL;
curop.debug().cursorExhausted = true;
LOG(5) << "getMore NOT saving client cursor, ended with state "
<< PlanExecutor::statestr(state) << endl;
} else {
// Continue caching the ClientCursor.
cc->incPos(numResults);
exec->saveState();
exec->detachFromOperationContext();
LOG(5) << "getMore saving client cursor ended with state "
<< PlanExecutor::statestr(state) << endl;
// Possibly note slave's position in the oplog.
if ((cc->queryOptions() & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
cc->slaveReadTill(slaveReadTill);
}
*exhaust = cc->queryOptions() & QueryOption_Exhaust;
// If the getmore had a time limit, remaining time is "rolled over" back to the
// cursor (for use by future getmore ops).
cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
}
}
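// Fill in the QueryResult header in the space that was skipped at the top of the buffer and
// hand the buffer back to the caller.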
QueryResult::View qr = bb.buf();
qr.msgdata().setLen(bb.len());
qr.msgdata().setOperation(opReply);
qr.setResultFlags(resultFlags);
qr.setCursorId(cursorid);
qr.setStartingFrom(startingResult);
qr.setNReturned(numResults);
bb.decouple();
LOG(5) << "getMore returned " << numResults << " results\n";
return qr;
}
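
/**
 * Called by db/instance.cpp. This is the OP_QUERY entry point: it runs the query, fills
 * 'result' with the first batch, and registers a ClientCursor if more results may follow.
 * Returns the namespace string if this is an exhaust query, and an empty string otherwise.
 */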
std::string runQuery(OperationContext* txn,
QueryMessage& q,
const NamespaceString& nss,
Message& result) {
CurOp& curop = *CurOp::get(txn);
// Validate the namespace.
uassert(16256, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid());
invariant(!nss.isCommand());
// Set curop information.
beginQueryOp(txn, nss, q.query, q.ntoreturn, q.ntoskip);
// Parse the qm into a CanonicalQuery.
auto statusWithCQ = CanonicalQuery::canonicalize(q, WhereCallbackReal(txn, nss.db()));
if (!statusWithCQ.isOK()) {
uasserted(
17287,
str::stream() << "Can't canonicalize query: " << statusWithCQ.getStatus().toString());
}
unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
invariant(cq.get());
LOG(5) << "Running query:\n" << cq->toString();
LOG(2) << "Running query: " << cq->toStringShort();
// Parse, canonicalize, plan, transcribe, and get a plan executor.
AutoGetCollectionForRead ctx(txn, nss);
Collection* collection = ctx.getCollection();
const int dbProfilingLevel =
ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;
// We have a parsed query. Time to get the execution plan for it.
std::unique_ptr<PlanExecutor> exec = uassertStatusOK(
getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO));
const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();
// If it's actually an explain, do the explain and return rather than falling through
// to the normal query execution loop.
if (pq.isExplain()) {
BufBuilder bb;
bb.skip(sizeof(QueryResult::Value));
BSONObjBuilder explainBob;
Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);
// Add the resulting object to the return buffer.
BSONObj explainObj = explainBob.obj();
bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());
// TODO: Does this get overwritten/do we really need to set this twice?
curop.debug().query = q.query;
// Set query result fields.
QueryResult::View qr = bb.buf();
bb.decouple();
qr.setResultFlagsToOk();
qr.msgdata().setLen(bb.len());
curop.debug().responseLength = bb.len();
qr.msgdata().setOperation(opReply);
qr.setCursorId(0);
qr.setStartingFrom(0);
qr.setNReturned(1);
result.setData(qr.view2ptr(), true);
return "";
}
ShardingState* const shardingState = ShardingState::get(txn);
// If this version changes before we're done with the query, we throw a stale config
// exception below.
const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns());
// Handle query option $maxTimeMS (not used with commands).
curop.setMaxTimeMicros(static_cast<long long>(pq.getMaxTimeMS()) * 1000);
txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.
// uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
bool slaveOK = pq.isSlaveOk() || pq.hasReadPref();
Status serveReadsStatus =
repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(txn, nss, slaveOK);
uassertStatusOK(serveReadsStatus);
// Run the query.
// 'bb' is used to hold the query results: it should contain either the requested documents
// or explain information, but not both.
BufBuilder bb(32768);
bb.skip(sizeof(QueryResult::Value));
// How many results have we obtained from the executor?
int numResults = 0;
// If we're replaying the oplog, we save the last time that we read.
Timestamp slaveReadTill;
BSONObj obj;
PlanExecutor::ExecState state;
// Get summary info about which plan the executor is using.
curop.debug().planSummary = Explain::getPlanSummary(exec.get());
while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
// Add result to output buffer.
bb.appendBuf((void*)obj.objdata(), obj.objsize());
// Count the result.
++numResults;
// Possibly note slave's position in the oplog.
if (pq.isOplogReplay()) {
BSONElement e = obj["ts"];
if (Date == e.type() || bsonTimestamp == e.type()) {
slaveReadTill = e.timestamp();
}
}
if (FindCommon::enoughForFirstBatch(pq, numResults, bb.len())) {
LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore()
<< " ntoreturn=" << pq.getNToReturn().value_or(0) << " numResults=" << numResults
<< endl;
break;
}
}
// If we cache the executor later, we want to deregister it as it receives notifications
// anyway by virtue of being cached.
//
// If we don't cache the executor later, we are deleting it, so it must be deregistered.
//
// So, no matter what, deregister the executor.
exec->deregisterExec();
// Caller expects exceptions thrown in certain cases.
if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
const unique_ptr<PlanStageStats> stats(exec->getStats());
error() << "Plan executor error during find: " << PlanExecutor::statestr(state)
<< ", stats: " << Explain::statsToBSON(*stats);
uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
}
// TODO: Currently, chunk ranges are kept around until all ClientCursors created while the
// chunk belonged on this node are gone. Separating chunk lifetime management from
// ClientCursor should allow this check to go away.
if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
// If the version changed during the query, we might be missing some data, and it's safe to
// send a stale config exception here because mongos can retry the query at this point.
throw SendStaleConfigException(nss.ns(),
"version changed during initial query",
shardingVersionAtStart,
shardingState->getVersion(nss.ns()));
}
// Fill out curop based on query results. If we have a cursorid, we will fill out curop with
// this cursorid later.
long long ccId = 0;
if (shouldSaveCursor(txn, collection, state, exec.get())) {
// We won't use the executor until it's getMore'd.
exec->saveState();
exec->detachFromOperationContext();
// Allocate a new ClientCursor. We don't have to worry about leaking it as it's
// inserted into a global map by its ctor.
ClientCursor* cc =
new ClientCursor(collection->getCursorManager(),
exec.release(),
nss.ns(),
txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
pq.getOptions(),
pq.getFilter());
ccId = cc->cursorid();
LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults
<< " results" << endl;
// TODO document
if (pq.isOplogReplay() && !slaveReadTill.isNull()) {
cc->slaveReadTill(slaveReadTill);
}
// TODO document
if (pq.isExhaust()) {
curop.debug().exhaust = true;
}
cc->setPos(numResults);
// If the query had a time limit, remaining time is "rolled over" to the cursor (for
// use by future getmore ops).
cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
endQueryOp(txn, collection, *cc->getExecutor(), dbProfilingLevel, numResults, ccId);
} else {
LOG(5) << "Not caching executor but returning " << numResults << " results.\n";
endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, ccId);
}
// Add the results from the query into the output buffer.
result.appendData(bb.buf(), bb.len());
bb.decouple();
// Fill out the output buffer's header.
QueryResult::View qr = result.header().view2ptr();
qr.setCursorId(ccId);
qr.setResultFlagsToOk();
qr.msgdata().setOperation(opReply);
qr.setStartingFrom(0);
qr.setNReturned(numResults);
// curop.debug().exhaust is set above.
return curop.debug().exhaust ? nss.ns() : "";
}
} // namespace mongo