// instance.cpp : Global state variables and functions. // /** * Copyright (C) 2008 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ #include "stdafx.h" #include "db.h" #include "query.h" #include "introspect.h" #include "repl.h" #include "dbmessage.h" #include "instance.h" #include "lasterror.h" #include "security.h" #include "json.h" #include "reccache.h" #include "replset.h" #include "../s/d_logic.h" #include "../util/file_allocator.h" #include "cmdline.h" #if !defined(_WIN32) #include #endif namespace mongo { auto_ptr< QueryResult > runQuery(Message& m, QueryMessage& q, stringstream& ss ); void receivedKillCursors(Message& m); void receivedUpdate(Message& m, stringstream& ss); void receivedDelete(Message& m, stringstream& ss); void receivedInsert(Message& m, stringstream& ss); bool receivedGetMore(DbResponse& dbresponse, Message& m, stringstream& ss); CmdLine cmdLine; int nloggedsome = 0; #define LOGSOME if( ++nloggedsome < 1000 || nloggedsome % 100 == 0 ) SlaveTypes slave = NotSlave; bool master = false; // true means keep an op log bool autoresync = false; /* we use new here so we don't have to worry about destructor orders at program shutdown */ MongoMutex &dbMutex( *(new MongoMutex) ); // MutexInfo dbMutexInfo; string dbExecCommand; string bind_ip = ""; char *appsrvPath = null; DiagLog _diaglog; int opIdMem = 100000000; bool useCursors = true; bool useHints = true; void closeAllSockets(); void flushOpLog( stringstream &ss ) { if( _diaglog.f && _diaglog.f->is_open() ) { ss << "flushing op log and files\n"; _diaglog.flush(); } } int ctr = 0; KillCurrentOp killCurrentOp; int lockFile = 0; void inProgCmd( Message &m, DbResponse &dbresponse ) { BSONObjBuilder b; AuthenticationInfo *ai = cc().ai; if( !ai->isAuthorized("admin") ) { BSONObjBuilder b; b.append("err", "unauthorized"); } else { vector vals; { boostlock bl(Client::clientsMutex); for( set::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) { Client *c = *i; CurOp& co = *(c->curop()); if( co.active() ) vals.push_back( co.infoNoauth() ); } } b.append("inprog", vals); } replyToQuery(0, m, dbresponse, b.obj()); } void killOp( Message &m, DbResponse &dbresponse ) { BSONObj obj; AuthenticationInfo *ai = currentClient.get()->ai; if( !ai->isAuthorized("admin") ) { obj = fromjson("{\"err\":\"unauthorized\"}"); } /*else if( !dbMutexInfo.isLocked() ) obj = fromjson("{\"info\":\"no op in progress/not locked\"}"); */ else { DbMessage d(m); QueryMessage q(d); BSONElement e = q.query.getField("op"); if( !e.isNumber() ) { obj = fromjson("{\"err\":\"no op number field specified?\"}"); } else { obj = fromjson("{\"info\":\"attempting to kill op\"}"); killCurrentOp.kill( (unsigned) e.number() ); } } replyToQuery(0, m, dbresponse, obj); } static bool receivedQuery(DbResponse& dbresponse, Message& m, stringstream& ss, bool logit, mongolock& lock ) { bool ok = true; MSGID responseTo = m.data->id; DbMessage d(m); QueryMessage q(d); QueryResult* msgdata; try { if (q.fields.get() && q.fields->errmsg) uassert( 10053 , q.fields->errmsg, false); /* note these are logged BEFORE authentication -- which is sort of ok */ if ( _diaglog.level && logit ) { if ( strstr(q.ns, ".$cmd") ) { /* $cmd queries are "commands" and usually best treated as write operations */ OPWRITE; } else { OPREAD; } } setClient( q.ns, dbpath, &lock ); Client& client = cc(); client.top.setRead(); client.curop()->setNS(q.ns); msgdata = runQuery(m, q, ss ).release(); } catch ( AssertionException& e ) { ok = false; ss << " exception "; LOGSOME problem() << " Caught Assertion in runQuery ns:" << q.ns << ' ' << e.toString() << '\n'; log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << '\n'; if ( q.query.valid() ) log() << " query:" << q.query.toString() << endl; else log() << " query object is not valid!" << endl; BSONObjBuilder err; err.append("$err", e.msg.empty() ? "assertion during query" : e.msg); BSONObj errObj = err.done(); BufBuilder b; b.skip(sizeof(QueryResult)); b.append((void*) errObj.objdata(), errObj.objsize()); // todo: call replyToQuery() from here instead of this!!! see dbmessage.h msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->resultFlags() = QueryResult::ResultFlag_ErrSet; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; } Message *resp = new Message(); resp->setData(msgdata, true); // transport will free dbresponse.response = resp; dbresponse.responseTo = responseTo; Database *database = cc().database(); if ( database ) { if ( database->profile ) ss << " bytes:" << resp->data->dataLen(); } else { if ( strstr(q.ns, "$cmd") == 0 ) // (this condition is normal for $cmd dropDatabase) log() << "ERROR: receiveQuery: database is null; ns=" << q.ns << endl; } return ok; } bool commandIsReadOnly(BSONObj& _cmdobj); // Returns false when request includes 'end' bool assembleResponse( Message &m, DbResponse &dbresponse, const sockaddr_in &client ) { bool writeLock = true; // before we lock... int op = m.data->operation(); const char *ns = m.data->_data + 4; if ( op == dbQuery ) { if( strstr(ns, ".$cmd") ) { if( strstr(ns, ".$cmd.sys.") ) { if( strstr(ns, "$cmd.sys.inprog") ) { inProgCmd(m, dbresponse); return true; } if( strstr(ns, "$cmd.sys.killop") ) { killOp(m, dbresponse); return true; } } DbMessage d( m ); QueryMessage q( d ); writeLock = !commandIsReadOnly(q.query); } else writeLock = false; } else if( op == dbGetMore ) { writeLock = false; } if ( handlePossibleShardedMessage( m , dbresponse ) ){ /* important to do this before we lock so if a message has to be forwarded, doesn't block for that */ return true; } Client& c = cc(); c.clearns(); stringstream ss; CurOp& currentOp = *c.curop(); currentOp.reset( client); currentOp.setOp(op); int logThreshold = 100; bool log = logLevel >= 1; Timer t( currentOp.startTime() ); mongolock lk(writeLock); #if 0 /* use this if you only want to process operations for a particular namespace. maybe add to cmd line parms or something fancier. */ DbMessage ddd(m); if ( strncmp(ddd.getns(), "clusterstock", 12) != 0 ) { static int q; if ( ++q < 20 ) out() << "TEMP skip " << ddd.getns() << endl; goto skip; } #endif if ( op == dbQuery ) { // receivedQuery() does its own authorization processing. if ( ! receivedQuery(dbresponse, m, ss, true, lk) ) log = true; } else if ( op == dbGetMore ) { // does its own authorization processing. OPREAD; DEV log = true; ss << "getmore "; if ( ! receivedGetMore(dbresponse, m, ss) ) log = true; } else if ( op == dbMsg ) { /* deprecated / rarely used. intended for connection diagnostics. */ ss << "msg "; char *p = m.data->_data; int len = strlen(p); if ( len > 400 ) out() << curTimeMillis() % 10000 << " long msg received, len:" << len << " ends with: " << p + len - 10 << endl; bool end = false; //strcmp("end", p) == 0; Message *resp = new Message(); resp->setData(opReply, "i am fine"); dbresponse.response = resp; dbresponse.responseTo = m.data->id; //dbMsgPort.reply(m, resp); if ( end ) return false; } else { const char *ns = m.data->_data + 4; char cl[256]; nsToClient(ns, cl); currentOp.setNS(ns); AuthenticationInfo *ai = currentClient.get()->ai; if( !ai->isAuthorized(cl) ) { uassert_nothrow("unauthorized"); } else if ( op == dbInsert ) { OPWRITE; try { ss << "insert "; receivedInsert(m, ss); } catch ( AssertionException& e ) { LOGSOME problem() << " Caught Assertion insert, continuing\n"; ss << " exception " + e.toString(); log = true; } } else if ( op == dbUpdate ) { OPWRITE; try { ss << "update "; receivedUpdate(m, ss); } catch ( AssertionException& e ) { LOGSOME problem() << " Caught Assertion update, continuing" << endl; ss << " exception " + e.toString(); log = true; } } else if ( op == dbDelete ) { OPWRITE; try { ss << "remove "; receivedDelete(m, ss); } catch ( AssertionException& e ) { LOGSOME problem() << " Caught Assertion receivedDelete, continuing" << endl; ss << " exception " + e.toString(); log = true; } } else if ( op == dbKillCursors ) { OPREAD; try { logThreshold = 10; ss << "killcursors "; receivedKillCursors(m); } catch ( AssertionException& e ) { problem() << " Caught Assertion in kill cursors, continuing" << endl; ss << " exception " + e.toString(); log = true; } } else { out() << " operation isn't supported: " << op << endl; currentOp.setActive(false); assert(false); } } int ms = t.millis(); log = log || (logLevel >= 2 && ++ctr % 512 == 0); DEV log = true; if ( log || ms > logThreshold ) { ss << ' ' << ms << "ms"; mongo::log() << ss.str() << endl; } Database *database = cc().database(); if ( database && database->profile >= 1 ) { if ( database->profile >= 2 || ms >= 100 ) { // performance profiling is on if ( dbMutex.getState() > 1 || dbMutex.getState() < -1 ){ out() << "warning: not profiling because recursive lock" << endl; } else { string old_ns = cc().ns(); lk.releaseAndWriteLock(); resetClient(old_ns.c_str()); profile(ss.str().c_str(), ms); } } } currentOp.setActive(false); return true; } void killCursors(int n, long long *ids); void receivedKillCursors(Message& m) { int *x = (int *) m.data->_data; x++; // reserved int n = *x++; assert( n >= 1 ); if ( n > 2000 ) { problem() << "Assertion failure, receivedKillCursors, n=" << n << endl; assert( n < 30000 ); } killCursors(n, (long long *) x); } /* cl - database name path - db directory */ void closeDatabase( const char *cl, const string& path ) { Database *database = cc().database(); assert( database ); assert( database->name == cl ); /* if ( string("local") != cl ) { DBInfo i(cl); i.dbDropped(); }*/ /* important: kill all open cursors on the database */ string prefix(cl); prefix += '.'; ClientCursor::invalidate(prefix.c_str()); NamespaceDetailsTransient::clearForPrefix( prefix.c_str() ); eraseDatabase( cl, path ); delete database; // closes files cc().clearns(); } void receivedUpdate(Message& m, stringstream& ss) { DbMessage d(m); const char *ns = d.getns(); assert(*ns); uassert( 10054 , "not master", isMasterNs( ns ) ); setClient(ns); Client& client = cc(); client.top.setWrite(); ss << ns << ' '; int flags = d.pullInt(); BSONObj query = d.nextJsObj(); assert( d.moreJSObjs() ); assert( query.objsize() < m.data->dataLen() ); BSONObj toupdate = d.nextJsObj(); uassert( 10055 , "update object too large", toupdate.objsize() <= MaxBSONObjectSize); assert( toupdate.objsize() < m.data->dataLen() ); assert( query.objsize() + toupdate.objsize() < m.data->dataLen() ); bool upsert = flags & Option_Upsert; bool multi = flags & Option_Multi; { string s = query.toString(); /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down. */ ss << " query: " << s; CurOp& currentOp = *client.curop(); currentOp.setQuery(query); } UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, ss, true); /* TODO FIX: recordUpdate should take a long int for parm #2 */ recordUpdate( res.existing , (int) res.num ); // for getlasterror } void receivedDelete(Message& m, stringstream &ss) { DbMessage d(m); const char *ns = d.getns(); assert(*ns); uassert( 10056 , "not master", isMasterNs( ns ) ); setClient(ns); Client& client = cc(); client.top.setWrite(); int flags = d.pullInt(); bool justOne = flags & 1; assert( d.moreJSObjs() ); BSONObj pattern = d.nextJsObj(); { string s = pattern.toString(); ss << " query: " << s; CurOp& currentOp = *client.curop(); currentOp.setQuery(pattern); } int n = deleteObjects(ns, pattern, justOne, true); recordDelete( n ); } QueryResult* emptyMoreResult(long long); bool receivedGetMore(DbResponse& dbresponse, /*AbstractMessagingPort& dbMsgPort, */Message& m, stringstream& ss) { bool ok = true; DbMessage d(m); const char *ns = d.getns(); ss << ns; setClient(ns); cc().top.setRead(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); ss << " cid:" << cursorid; ss << " ntoreturn:" << ntoreturn; QueryResult* msgdata; try { AuthenticationInfo *ai = currentClient.get()->ai; uassert( 10057 , "unauthorized", ai->isAuthorized(cc().database()->name.c_str())); msgdata = getMore(ns, ntoreturn, cursorid, ss); } catch ( AssertionException& e ) { ss << " exception " + e.toString(); msgdata = emptyMoreResult(cursorid); ok = false; } Message *resp = new Message(); resp->setData(msgdata, true); ss << " bytes:" << resp->data->dataLen(); ss << " nreturned:" << msgdata->nReturned; dbresponse.response = resp; dbresponse.responseTo = m.data->id; //dbMsgPort.reply(m, resp); return ok; } void receivedInsert(Message& m, stringstream& ss) { DbMessage d(m); const char *ns = d.getns(); assert(*ns); uassert( 10058 , "not master", isMasterNs( ns ) ); setClient(ns); cc().top.setWrite(); ss << ns; while ( d.moreJSObjs() ) { BSONObj js = d.nextJsObj(); uassert( 10059 , "object to insert too large", js.objsize() <= MaxBSONObjectSize); theDataFileMgr.insert(ns, js, false); logOp("i", ns, js); } } class JniMessagingPort : public AbstractMessagingPort { public: JniMessagingPort(Message& _container) : container(_container) { } void reply(Message& received, Message& response, MSGID) { container = response; } void reply(Message& received, Message& response) { container = response; } unsigned remotePort(){ return 1; } Message & container; }; void getDatabaseNames( vector< string > &names ) { boost::filesystem::path path( dbpath ); for ( boost::filesystem::directory_iterator i( path ); i != boost::filesystem::directory_iterator(); ++i ) { string fileName = boost::filesystem::path(*i).leaf(); if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" ) names.push_back( fileName.substr( 0, fileName.length() - 3 ) ); } } bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk ) { SavedContext c; DbResponse dbResponse; assembleResponse( toSend, dbResponse ); assert( dbResponse.response ); response = *dbResponse.response; return true; } void DBDirectClient::say( Message &toSend ) { SavedContext c; DbResponse dbResponse; assembleResponse( toSend, dbResponse ); } auto_ptr DBDirectClient::query(const string &ns, Query query, int nToReturn , int nToSkip , const BSONObj *fieldsToReturn , int queryOptions ){ //if ( ! query.obj.isEmpty() || nToReturn != 0 || nToSkip != 0 || fieldsToReturn || queryOptions ) return DBClientBase::query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions ); // //assert( query.obj.isEmpty() ); //throw UserException( (string)"yay:" + ns ); } DBDirectClient::AlwaysAuthorized DBDirectClient::SavedContext::always; DBClientBase * createDirectClient(){ return new DBDirectClient(); } void recCacheCloseAll(); boost::mutex &listenerSocketMutex( *( new boost::mutex ) ); vector< int > listenerSockets; void registerListenerSocket( int socket ) { boostlock lk( listenerSocketMutex ); listenerSockets.push_back( socket ); } boost::mutex &exitMutex( *( new boost::mutex ) ); int numExitCalls = 0; void shutdown(); bool inShutdown(){ return numExitCalls > 0; } void tryToOutputFatal( const string& s ){ try { rawOut( s ); return; } catch ( ... ){} try { cerr << s << endl; return; } catch ( ... ){} // uh - oh, not sure there is anything else we can do... } /* not using log() herein in case we are already locked */ void dbexit( ExitCode rc, const char *why) { { boostlock lk( exitMutex ); if ( numExitCalls++ > 0 ) { if ( numExitCalls > 5 ){ // this means something horrible has happened ::_exit( rc ); } stringstream ss; ss << "dbexit: " << why << "; exiting immediately" << endl; tryToOutputFatal( ss.str() ); ::exit( rc ); } } stringstream ss; ss << "dbexit: " << why << endl; tryToOutputFatal( ss.str() ); try { shutdown(); // gracefully shutdown instance } catch ( ... ){ tryToOutputFatal( "shutdown failed with exception" ); } tryToOutputFatal( "dbexit: really exiting now\n" ); ::exit(rc); } void shutdown() { #ifndef _WIN32 { // close listener sockets // We would only hang here if a synchronous signal is received // during a registerListenerSocket() call, which we don't expect. boostlock lk( listenerSocketMutex ); for( vector< int >::iterator i = listenerSockets.begin(); i != listenerSockets.end(); ++i ) close( *i ); } #endif log() << "\t shutdown: going to flush oplog..." << endl; stringstream ss2; flushOpLog( ss2 ); rawOut( ss2.str() ); /* must do this before unmapping mem or you may get a seg fault */ log() << "\t shutdown: going to close sockets..." << endl; closeAllSockets(); // wait until file preallocation finishes // we would only hang here if the file_allocator code generates a // synchronous signal, which we don't expect log() << "\t shutdown: waiting for fs..." << endl; theFileAllocator().waitUntilFinished(); log() << "\t shutdown: closing all files..." << endl; stringstream ss3; MemoryMappedFile::closeAllFiles( ss3 ); rawOut( ss3.str() ); // should we be locked here? we aren't. might be ok as-is. recCacheCloseAll(); #if !defined(_WIN32) && !defined(__sunos__) if ( lockFile ){ log() << "\t shutdown: removing fs lock..." << endl; if( ftruncate( lockFile , 0 ) ) log() << "\t couldn't remove fs lock " << OUTPUT_ERRNO << endl; flock( lockFile, LOCK_UN ); } #endif } void acquirePathLock() { #if !defined(_WIN32) && !defined(__sunos__) string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string(); lockFile = open( name.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO ); massert( 10309 , "Unable to create / open lock file for dbpath: " + name, lockFile > 0 ); massert( 10310 , "Unable to acquire lock for dbpath: " + name, flock( lockFile, LOCK_EX | LOCK_NB ) == 0 ); stringstream ss; ss << getpid() << endl; string s = ss.str(); const char * data = s.c_str(); assert( write( lockFile , data , strlen( data ) ) ); fsync( lockFile ); #endif } } // namespace mongo