author    Eliot Horowitz <eliot@10gen.com>  2011-12-24 15:33:26 -0500
committer Eliot Horowitz <eliot@10gen.com>  2011-12-24 15:33:45 -0500
commit    ae1ecd9c786911f9f1f0242f0f7d702b3e5dfeba (patch)
tree      92f8e1649e6f080b251ff5f1763679a72eb59b34 /src/mongo/db
parent    dfa4cd7e2cf109b072440155fabc08a93c8045a0 (diff)
download  mongo-ae1ecd9c786911f9f1f0242f0f7d702b3e5dfeba.tar.gz
bulk move of code to src/ SERVER-4551
Diffstat (limited to 'src/mongo/db')
-rw-r--r--  src/mongo/db/background.h | 56
-rw-r--r--  src/mongo/db/btree.cpp | 1980
-rw-r--r--  src/mongo/db/btree.h | 1174
-rw-r--r--  src/mongo/db/btreebuilder.cpp | 184
-rw-r--r--  src/mongo/db/btreebuilder.h | 53
-rw-r--r--  src/mongo/db/btreecursor.cpp | 457
-rw-r--r--  src/mongo/db/cap.cpp | 457
-rw-r--r--  src/mongo/db/client.cpp | 697
-rw-r--r--  src/mongo/db/client.h | 286
-rw-r--r--  src/mongo/db/client_common.h | 47
-rw-r--r--  src/mongo/db/clientcursor.cpp | 747
-rw-r--r--  src/mongo/db/clientcursor.h | 430
-rw-r--r--  src/mongo/db/cloner.cpp | 763
-rw-r--r--  src/mongo/db/cloner.h | 39
-rw-r--r--  src/mongo/db/cmdline.cpp | 519
-rw-r--r--  src/mongo/db/cmdline.h | 203
-rw-r--r--  src/mongo/db/collection.h | 15
-rwxr-xr-x  src/mongo/db/commands.cpp | 209
-rw-r--r--  src/mongo/db/commands.h | 164
-rwxr-xr-x  src/mongo/db/commands/aggregate.js | 184
-rw-r--r--  src/mongo/db/commands/cloud.cpp | 90
-rw-r--r--  src/mongo/db/commands/distinct.cpp | 157
-rwxr-xr-x  src/mongo/db/commands/document_source_cursor.cpp | 100
-rw-r--r--  src/mongo/db/commands/find_and_modify.cpp | 153
-rw-r--r--  src/mongo/db/commands/group.cpp | 224
-rw-r--r--  src/mongo/db/commands/isself.cpp | 246
-rw-r--r--  src/mongo/db/commands/mr.cpp | 1317
-rw-r--r--  src/mongo/db/commands/mr.h | 319
-rwxr-xr-x  src/mongo/db/commands/pipeline.cpp | 405
-rwxr-xr-x  src/mongo/db/commands/pipeline.h | 183
-rwxr-xr-x  src/mongo/db/commands/pipeline_command.cpp | 187
-rw-r--r--  src/mongo/db/common.cpp | 73
-rw-r--r--  src/mongo/db/compact.cpp | 376
-rw-r--r--  src/mongo/db/compact.h | 50
-rw-r--r--  src/mongo/db/concurrency.h | 21
-rw-r--r--  src/mongo/db/curop-inl.h | 1
-rw-r--r--  src/mongo/db/curop.cpp | 173
-rw-r--r--  src/mongo/db/curop.h | 313
-rw-r--r--  src/mongo/db/cursor.cpp | 166
-rw-r--r--  src/mongo/db/cursor.h | 246
-rwxr-xr-x  src/mongo/db/d_concurrency.cpp | 231
-rw-r--r--  src/mongo/db/d_concurrency.h | 67
-rw-r--r--  src/mongo/db/d_globals.cpp | 20
-rw-r--r--  src/mongo/db/d_globals.h | 27
-rw-r--r--  src/mongo/db/database.cpp | 423
-rw-r--r--  src/mongo/db/database.h | 145
-rw-r--r--  src/mongo/db/databaseholder.h | 126
-rw-r--r--  src/mongo/db/db.cpp | 1309
-rw-r--r--  src/mongo/db/db.h | 120
-rwxr-xr-x  src/mongo/db/db.rc | 12
-rwxr-xr-x  src/mongo/db/db.vcxproj | 934
-rwxr-xr-x  src/mongo/db/db.vcxproj.filters | 432
-rwxr-xr-x  src/mongo/db/db_10.sln | 168
-rw-r--r--  src/mongo/db/dbcommands.cpp | 1955
-rw-r--r--  src/mongo/db/dbcommands_admin.cpp | 550
-rw-r--r--  src/mongo/db/dbcommands_generic.cpp | 432
-rw-r--r--  src/mongo/db/dbeval.cpp | 136
-rw-r--r--  src/mongo/db/dbhelpers.cpp | 353
-rw-r--r--  src/mongo/db/dbhelpers.h | 159
-rw-r--r--  src/mongo/db/dbmessage.cpp | 108
-rw-r--r--  src/mongo/db/dbmessage.h | 282
-rw-r--r--  src/mongo/db/dbwebserver.cpp | 539
-rw-r--r--  src/mongo/db/dbwebserver.h | 85
-rw-r--r--  src/mongo/db/diskloc.h | 160
-rw-r--r--  src/mongo/db/driverHelpers.cpp | 62
-rw-r--r--  src/mongo/db/dur.cpp | 840
-rw-r--r--  src/mongo/db/dur.h | 209
-rw-r--r--  src/mongo/db/dur_commitjob.cpp | 240
-rw-r--r--  src/mongo/db/dur_commitjob.h | 220
-rw-r--r--  src/mongo/db/dur_journal.cpp | 748
-rw-r--r--  src/mongo/db/dur_journal.h | 68
-rw-r--r--  src/mongo/db/dur_journalformat.h | 174
-rw-r--r--  src/mongo/db/dur_journalimpl.h | 103
-rw-r--r--  src/mongo/db/dur_preplogbuffer.cpp | 177
-rw-r--r--  src/mongo/db/dur_recover.cpp | 544
-rw-r--r--  src/mongo/db/dur_recover.h | 50
-rw-r--r--  src/mongo/db/dur_stats.h | 49
-rw-r--r--  src/mongo/db/dur_writetodatafiles.cpp | 94
-rw-r--r--  src/mongo/db/durop.cpp | 161
-rw-r--r--  src/mongo/db/durop.h | 109
-rw-r--r--  src/mongo/db/extsort.cpp | 245
-rw-r--r--  src/mongo/db/extsort.h | 150
-rw-r--r--  src/mongo/db/filever.h | 30
-rw-r--r--  src/mongo/db/flushtest.cpp | 150
-rw-r--r--  src/mongo/db/geo/2d.cpp | 3289
-rw-r--r--  src/mongo/db/geo/core.h | 550
-rw-r--r--  src/mongo/db/geo/haystack.cpp | 318
-rw-r--r--  src/mongo/db/globals.h | 54
-rw-r--r--  src/mongo/db/helpers/dblogger.h | 31
-rw-r--r--  src/mongo/db/index.cpp | 446
-rw-r--r--  src/mongo/db/index.h | 237
-rw-r--r--  src/mongo/db/indexkey.cpp | 462
-rw-r--r--  src/mongo/db/indexkey.h | 198
-rw-r--r--  src/mongo/db/instance.cpp | 1148
-rw-r--r--  src/mongo/db/instance.h | 174
-rw-r--r--  src/mongo/db/introspect.cpp | 88
-rw-r--r--  src/mongo/db/introspect.h | 34
-rw-r--r--  src/mongo/db/javatest.cpp | 24
-rw-r--r--  src/mongo/db/jsobj.cpp | 1268
-rw-r--r--  src/mongo/db/jsobj.h | 47
-rw-r--r--  src/mongo/db/jsobjmanipulator.h | 94
-rw-r--r--  src/mongo/db/json.cpp | 651
-rw-r--r--  src/mongo/db/json.h | 41
-rw-r--r--  src/mongo/db/key.cpp | 678
-rw-r--r--  src/mongo/db/key.h | 115
-rw-r--r--  src/mongo/db/lasterror.cpp | 142
-rw-r--r--  src/mongo/db/lasterror.h | 146
-rwxr-xr-x  src/mongo/db/matcher.cpp | 1128
-rw-r--r--  src/mongo/db/matcher.h | 276
-rw-r--r--  src/mongo/db/matcher_covered.cpp | 101
-rw-r--r--  src/mongo/db/minilex.h | 164
-rw-r--r--  src/mongo/db/module.cpp | 68
-rw-r--r--  src/mongo/db/module.h | 70
-rw-r--r--  src/mongo/db/modules/mms.cpp | 170
-rwxr-xr-x  src/mongo/db/mongo.ico | bin 0 -> 51262 bytes
-rw-r--r--  src/mongo/db/mongommf.cpp | 339
-rw-r--r--  src/mongo/db/mongommf.h | 145
-rw-r--r--  src/mongo/db/mongomutex.h | 388
-rw-r--r--  src/mongo/db/namespace-inl.h | 132
-rw-r--r--  src/mongo/db/namespace.cpp | 800
-rw-r--r--  src/mongo/db/namespace.h | 629
-rw-r--r--  src/mongo/db/namespacestring.h | 147
-rw-r--r--  src/mongo/db/nonce.cpp | 95
-rw-r--r--  src/mongo/db/nonce.h | 36
-rw-r--r--  src/mongo/db/oplog.cpp | 872
-rw-r--r--  src/mongo/db/oplog.h | 149
-rw-r--r--  src/mongo/db/oplogreader.h | 121
-rw-r--r--  src/mongo/db/ops/count.cpp | 103
-rw-r--r--  src/mongo/db/ops/count.h | 30
-rw-r--r--  src/mongo/db/ops/delete.cpp | 158
-rw-r--r--  src/mongo/db/ops/delete.h | 33
-rw-r--r--  src/mongo/db/ops/query.cpp | 870
-rw-r--r--  src/mongo/db/ops/query.h | 248
-rw-r--r--  src/mongo/db/ops/update.cpp | 1308
-rw-r--r--  src/mongo/db/ops/update.h | 700
-rw-r--r--  src/mongo/db/pagefault.cpp | 55
-rw-r--r--  src/mongo/db/pagefault.h | 46
-rw-r--r--  src/mongo/db/pcre.txt | 15
-rw-r--r--  src/mongo/db/pdfile.cpp | 2425
-rw-r--r--  src/mongo/db/pdfile.h | 546
-rwxr-xr-x  src/mongo/db/pipeline/accumulator.cpp | 92
-rwxr-xr-x  src/mongo/db/pipeline/accumulator.h | 259
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_add_to_set.cpp | 79
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_avg.cpp | 123
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_first.cpp | 49
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_last.cpp | 48
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_min_max.cpp | 67
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_push.cpp | 73
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_single_value.cpp | 32
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_sum.cpp | 74
-rwxr-xr-x  src/mongo/db/pipeline/builder.cpp | 117
-rwxr-xr-x  src/mongo/db/pipeline/builder.h | 95
-rwxr-xr-x  src/mongo/db/pipeline/doc_mem_monitor.cpp | 68
-rwxr-xr-x  src/mongo/db/pipeline/doc_mem_monitor.h | 94
-rwxr-xr-x  src/mongo/db/pipeline/document.cpp | 219
-rwxr-xr-x  src/mongo/db/pipeline/document.h | 246
-rwxr-xr-x  src/mongo/db/pipeline/document_source.cpp | 52
-rwxr-xr-x  src/mongo/db/pipeline/document_source.h | 985
-rwxr-xr-x  src/mongo/db/pipeline/document_source_bson_array.cpp | 83
-rwxr-xr-x  src/mongo/db/pipeline/document_source_command_futures.cpp | 132
-rwxr-xr-x  src/mongo/db/pipeline/document_source_filter.cpp | 98
-rwxr-xr-x  src/mongo/db/pipeline/document_source_filter_base.cpp | 85
-rwxr-xr-x  src/mongo/db/pipeline/document_source_group.cpp | 391
-rw-r--r--  src/mongo/db/pipeline/document_source_limit.cpp | 83
-rwxr-xr-x  src/mongo/db/pipeline/document_source_match.cpp | 80
-rwxr-xr-x  src/mongo/db/pipeline/document_source_out.cpp | 56
-rwxr-xr-x  src/mongo/db/pipeline/document_source_project.cpp | 201
-rw-r--r--  src/mongo/db/pipeline/document_source_skip.cpp | 99
-rwxr-xr-x  src/mongo/db/pipeline/document_source_sort.cpp | 216
-rwxr-xr-x  src/mongo/db/pipeline/document_source_unwind.cpp | 234
-rwxr-xr-x  src/mongo/db/pipeline/expression.cpp | 2815
-rwxr-xr-x  src/mongo/db/pipeline/expression.h | 1223
-rwxr-xr-x  src/mongo/db/pipeline/expression_context.cpp | 35
-rwxr-xr-x  src/mongo/db/pipeline/expression_context.h | 67
-rwxr-xr-x  src/mongo/db/pipeline/field_path.cpp | 87
-rwxr-xr-x  src/mongo/db/pipeline/field_path.h | 82
-rwxr-xr-x  src/mongo/db/pipeline/value.cpp | 1034
-rwxr-xr-x  src/mongo/db/pipeline/value.h | 468
-rw-r--r--  src/mongo/db/projection.cpp | 301
-rw-r--r--  src/mongo/db/projection.h | 129
-rw-r--r--  src/mongo/db/queryoptimizer.cpp | 1337
-rw-r--r--  src/mongo/db/queryoptimizer.h | 599
-rw-r--r--  src/mongo/db/queryoptimizercursor.cpp | 530
-rw-r--r--  src/mongo/db/queryoptimizercursor.h | 150
-rw-r--r--  src/mongo/db/querypattern.cpp | 99
-rw-r--r--  src/mongo/db/querypattern.h | 78
-rw-r--r--  src/mongo/db/queryutil-inl.h | 153
-rw-r--r--  src/mongo/db/queryutil.cpp | 1551
-rw-r--r--  src/mongo/db/queryutil.h | 443
-rw-r--r--  src/mongo/db/record.cpp | 267
-rw-r--r--  src/mongo/db/repl.cpp | 1516
-rw-r--r--  src/mongo/db/repl.h | 199
-rw-r--r--  src/mongo/db/repl/connections.h | 128
-rw-r--r--  src/mongo/db/repl/consensus.cpp | 449
-rw-r--r--  src/mongo/db/repl/health.cpp | 449
-rw-r--r--  src/mongo/db/repl/health.h | 50
-rw-r--r--  src/mongo/db/repl/heartbeat.cpp | 382
-rw-r--r--  src/mongo/db/repl/manager.cpp | 274
-rw-r--r--  src/mongo/db/repl/multicmd.h | 75
-rw-r--r--  src/mongo/db/repl/replset_commands.cpp | 404
-rw-r--r--  src/mongo/db/repl/rs.cpp | 778
-rw-r--r--  src/mongo/db/repl/rs.h | 667
-rw-r--r--  src/mongo/db/repl/rs_config.cpp | 662
-rw-r--r--  src/mongo/db/repl/rs_config.h | 251
-rw-r--r--  src/mongo/db/repl/rs_exception.h | 17
-rw-r--r--  src/mongo/db/repl/rs_initialsync.cpp | 271
-rw-r--r--  src/mongo/db/repl/rs_initiate.cpp | 269
-rw-r--r--  src/mongo/db/repl/rs_member.h | 131
-rw-r--r--  src/mongo/db/repl/rs_optime.h | 58
-rw-r--r--  src/mongo/db/repl/rs_rollback.cpp | 667
-rw-r--r--  src/mongo/db/repl/rs_sync.cpp | 701
-rw-r--r--  src/mongo/db/repl/test.html | 11
-rw-r--r--  src/mongo/db/repl/testing.js | 42
-rw-r--r--  src/mongo/db/repl_block.cpp | 256
-rw-r--r--  src/mongo/db/repl_block.h | 39
-rw-r--r--  src/mongo/db/replutil.h | 102
-rw-r--r--  src/mongo/db/resource.h | 16
-rw-r--r--  src/mongo/db/restapi.cpp | 294
-rw-r--r--  src/mongo/db/restapi.h | 34
-rw-r--r--  src/mongo/db/scanandorder.cpp | 105
-rw-r--r--  src/mongo/db/scanandorder.h | 111
-rw-r--r--  src/mongo/db/security.cpp | 106
-rwxr-xr-x  src/mongo/db/security.h | 113
-rw-r--r--  src/mongo/db/security_commands.cpp | 150
-rw-r--r--  src/mongo/db/security_common.cpp | 148
-rw-r--r--  src/mongo/db/security_common.h | 85
-rw-r--r--  src/mongo/db/stats/counters.cpp | 207
-rw-r--r--  src/mongo/db/stats/counters.h | 159
-rw-r--r--  src/mongo/db/stats/fine_clock.h | 67
-rw-r--r--  src/mongo/db/stats/service_stats.cpp | 68
-rw-r--r--  src/mongo/db/stats/service_stats.h | 66
-rw-r--r--  src/mongo/db/stats/snapshots.cpp | 227
-rw-r--r--  src/mongo/db/stats/snapshots.h | 114
-rw-r--r--  src/mongo/db/stats/top.cpp | 183
-rw-r--r--  src/mongo/db/stats/top.h | 247
-rw-r--r--  src/mongo/db/taskqueue.h | 106
-rw-r--r--  src/mongo/db/tests.cpp | 68
237 files changed, 76762 insertions, 0 deletions
diff --git a/src/mongo/db/background.h b/src/mongo/db/background.h
new file mode 100644
index 00000000000..ea424c97107
--- /dev/null
+++ b/src/mongo/db/background.h
@@ -0,0 +1,56 @@
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* background.h
+
+ Concurrency coordination for administrative operations.
+*/
+
+#pragma once
+
+namespace mongo {
+
+    /* These are administrative operations / jobs that run in the
+       background for a namespace. Only one is permitted per namespace
+       at a time, and while one is in progress other major
+       NamespaceDetails manipulations (such as dropping the ns or db)
+       are not allowed, even in the foreground; they must uassert
+       instead.
+
+       This is not assumed to be a high-RPS path, so the implementation
+       here makes no special effort to be fast.
+ */
+ class BackgroundOperation : public boost::noncopyable {
+ public:
+ static bool inProgForDb(const char *db);
+ static bool inProgForNs(const char *ns);
+ static void assertNoBgOpInProgForDb(const char *db);
+ static void assertNoBgOpInProgForNs(const char *ns);
+ static void dump(stringstream&);
+
+ /* check for in progress before instantiating */
+ BackgroundOperation(const char *ns);
+
+ virtual ~BackgroundOperation();
+
+ private:
+ NamespaceString _ns;
+ static map<string, unsigned> dbsInProg;
+ static set<string> nsInProg;
+ };
+
+} // namespace mongo
+
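A minimal usage sketch for the guard declared above (an editorial illustration, not part of this commit; the calling function is assumed):

    // Typical caller: refuse to start if the namespace is already busy,
    // then hold the guard for the duration of the admin job.
    void runBackgroundAdminJob(const char *ns) {
        BackgroundOperation::assertNoBgOpInProgForNs(ns); // uasserts if busy
        BackgroundOperation bgOp(ns);  // marks ns (and its db) in progress
        // ... long-running admin work; drops of this ns/db now uassert ...
    }                                  // destructor clears the registration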
diff --git a/src/mongo/db/btree.cpp b/src/mongo/db/btree.cpp
new file mode 100644
index 00000000000..5c55fad33c3
--- /dev/null
+++ b/src/mongo/db/btree.cpp
@@ -0,0 +1,1980 @@
+// btree.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "json.h"
+#include "clientcursor.h"
+#include "client.h"
+#include "dbhelpers.h"
+#include "curop-inl.h"
+#include "stats/counters.h"
+#include "dur_commitjob.h"
+#include "btreebuilder.h"
+#include "../util/unittest.h"
+#include "../server.h"
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( Record::HeaderSize == 16 );
+ BOOST_STATIC_ASSERT( Record::HeaderSize + BtreeData_V1::BucketSize == 8192 );
+
+ NOINLINE_DECL void checkFailed(unsigned line) {
+ static time_t last;
+ if( time(0) - last >= 10 ) {
+            msgasserted(15898, str::stream() << "error in index, possibly corruption; consider repairing " << line);
+ }
+ }
+
+ /** data check. like assert, but gives a reasonable error message to the user. */
+#define check(expr) if(!(expr) ) { checkFailed(__LINE__); }
+
+#define VERIFYTHISLOC dassert( thisLoc.btree<V>() == this );
+
+ template< class Loc >
+ __KeyNode<Loc> & __KeyNode<Loc>::writing() const {
+ return *getDur().writing( const_cast< __KeyNode<Loc> * >( this ) );
+ }
+
+ // BucketBasics::lowWaterMark()
+ //
+ // We define this value as the maximum number of bytes such that, if we have
+ // fewer than this many bytes, we must be able to either merge with or receive
+ // keys from any neighboring node. If our utilization goes below this value we
+ // know we can bring up the utilization with a simple operation. Ignoring the
+ // 90/10 split policy which is sometimes employed and our 'unused' nodes, this
+ // is a lower bound on bucket utilization for non root buckets.
+ //
+ // Note that the exact value here depends on the implementation of
+ // rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as
+ // follows: We know we cannot merge with the neighbor, so the total data size
+ // for us, the neighbor, and the separator must be at least
+ // BtreeBucket<V>::bodySize() + 1. We must be able to accept one key of any
+ // allowed size, so our size plus storage for that additional key must be
+ // <= BtreeBucket<V>::bodySize() / 2. This way, with the extra key we'll have a
+ // new bucket data size < half the total data size and by the implementation
+ // of rebalancedSeparatorPos() the key must be added.
+
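+    // A sketch of the bound described above (editorial illustration, not
+    // part of this commit). One closed form consistent with these
+    // conditions is:
+    //
+    //     static int lowWaterMark() {
+    //         return bodySize() / 2 - KeyMax - (int)sizeof(_KeyNode) + 1;
+    //     }
+    //
+    // i.e. half the body, minus room for one maximum-size key and its
+    // _KeyNode slot; below this threshold a merge with, or a borrow from,
+    // a neighbor is always possible.
+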
+ static const int split_debug = 0;
+ static const int insert_debug = 0;
+
+ /**
+     * this error is ok/benign when doing a background index build -- the logic in pdfile checks explicitly
+ * for the 10287 error code.
+ */
+ static void alreadyInIndex() {
+ // we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord()
+ throw MsgAssertionException(10287, "btree: key+recloc already in index");
+ }
+
+ /* BucketBasics --------------------------------------------------- */
+
+ template< class V >
+ void BucketBasics<V>::assertWritable() {
+ if( cmdLine.dur )
+ dur::assertAlreadyDeclared(this, V::BucketSize);
+ }
+
+ template< class V >
+ string BtreeBucket<V>::bucketSummary() const {
+ stringstream ss;
+ ss << " Bucket info:" << endl;
+ ss << " n: " << this->n << endl;
+ ss << " parent: " << this->parent.toString() << endl;
+ ss << " nextChild: " << this->nextChild.toString() << endl;
+ ss << " flags:" << this->flags << endl;
+ ss << " emptySize: " << this->emptySize << " topSize: " << this->topSize << endl;
+ return ss.str();
+ }
+
+ template< class V >
+ int BucketBasics<V>::Size() const {
+ return V::BucketSize;
+ }
+
+ template< class V >
+ void BucketBasics<V>::_shape(int level, stringstream& ss) const {
+ for ( int i = 0; i < level; i++ ) ss << ' ';
+ ss << "*[" << this->n << "]\n";
+ for ( int i = 0; i < this->n; i++ ) {
+ if ( !k(i).prevChildBucket.isNull() ) {
+ DiskLoc ll = k(i).prevChildBucket;
+ ll.btree<V>()->_shape(level+1,ss);
+ }
+ }
+ if ( !this->nextChild.isNull() ) {
+ DiskLoc ll = this->nextChild;
+ ll.btree<V>()->_shape(level+1,ss);
+ }
+ }
+
+ int bt_fv=0;
+ int bt_dmp=0;
+
+ template< class V >
+ void BtreeBucket<V>::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const {
+ bt_dmp=1;
+ fullValidate(thisLoc, order);
+ bt_dmp=0;
+ }
+
+ template< class V >
+ long long BtreeBucket<V>::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount, bool strict, unsigned depth) const {
+ {
+ bool f = false;
+ assert( f = true );
+ massert( 10281 , "assert is misdefined", f);
+ }
+
+ killCurrentOp.checkForInterrupt();
+ this->assertValid(order, true);
+
+ if ( bt_dmp ) {
+ _log() << thisLoc.toString() << ' ';
+ ((BtreeBucket *) this)->dump(depth);
+ }
+
+ // keycount
+ long long kc = 0;
+
+ for ( int i = 0; i < this->n; i++ ) {
+ const _KeyNode& kn = this->k(i);
+
+ if ( kn.isUsed() ) {
+ kc++;
+ }
+ else {
+ if ( unusedCount ) {
+ ++( *unusedCount );
+ }
+ }
+ if ( !kn.prevChildBucket.isNull() ) {
+ DiskLoc left = kn.prevChildBucket;
+ const BtreeBucket *b = left.btree<V>();
+ if ( strict ) {
+ assert( b->parent == thisLoc );
+ }
+ else {
+ wassert( b->parent == thisLoc );
+ }
+ kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict, depth+1);
+ }
+ }
+ if ( !this->nextChild.isNull() ) {
+ DiskLoc ll = this->nextChild;
+ const BtreeBucket *b = ll.btree<V>();
+ if ( strict ) {
+ assert( b->parent == thisLoc );
+ }
+ else {
+ wassert( b->parent == thisLoc );
+ }
+ kc += b->fullValidate(this->nextChild, order, unusedCount, strict, depth+1);
+ }
+
+ return kc;
+ }
+
+ int nDumped = 0;
+
+ template< class V >
+ void BucketBasics<V>::assertValid(const Ordering &order, bool force) const {
+ if ( !debug && !force )
+ return;
+ {
+ int foo = this->n;
+ wassert( foo >= 0 && this->n < Size() );
+ foo = this->emptySize;
+ wassert( foo >= 0 && this->emptySize < V::BucketSize );
+ wassert( this->topSize >= this->n && this->topSize <= V::BucketSize );
+ }
+
+ // this is very slow so don't do often
+ {
+ static int _k;
+ if( ++_k % 128 )
+ return;
+ }
+
+ DEV {
+ // slow:
+ for ( int i = 0; i < this->n-1; i++ ) {
+ Key k1 = keyNode(i).key;
+ Key k2 = keyNode(i+1).key;
+ int z = k1.woCompare(k2, order); //OK
+ if ( z > 0 ) {
+ out() << "ERROR: btree key order corrupt. Keys:" << endl;
+ if ( ++nDumped < 5 ) {
+ for ( int j = 0; j < this->n; j++ ) {
+ out() << " " << keyNode(j).key.toString() << endl;
+ }
+ ((BtreeBucket<V> *) this)->dump();
+ }
+ wassert(false);
+ break;
+ }
+ else if ( z == 0 ) {
+ if ( !(k(i).recordLoc < k(i+1).recordLoc) ) {
+ out() << "ERROR: btree key order corrupt (recordloc's wrong):" << endl;
+ out() << " k(" << i << ")" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl;
+ out() << " k(" << i+1 << ")" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl;
+ wassert( k(i).recordLoc < k(i+1).recordLoc );
+ }
+ }
+ }
+ }
+ else {
+ //faster:
+ if ( this->n > 1 ) {
+ Key k1 = keyNode(0).key;
+ Key k2 = keyNode(this->n-1).key;
+ int z = k1.woCompare(k2, order);
+ //wassert( z <= 0 );
+ if ( z > 0 ) {
+ problem() << "btree keys out of order" << '\n';
+ ONCE {
+ ((BtreeBucket<V> *) this)->dump();
+ }
+ assert(false);
+ }
+ }
+ }
+ }
+
+ template< class V >
+ inline void BucketBasics<V>::markUnused(int keypos) {
+ assert( keypos >= 0 && keypos < this->n );
+ k(keypos).setUnused();
+ }
+
+ template< class V >
+ inline int BucketBasics<V>::totalDataSize() const {
+ return (int) (Size() - (this->data-(char*)this));
+ }
+
+ template< class V >
+ void BucketBasics<V>::init() {
+ this->_init();
+ this->parent.Null();
+ this->nextChild.Null();
+ this->flags = Packed;
+ this->n = 0;
+ this->emptySize = totalDataSize();
+ this->topSize = 0;
+ }
+
+ /** see _alloc */
+ template< class V >
+ inline void BucketBasics<V>::_unalloc(int bytes) {
+ this->topSize -= bytes;
+ this->emptySize += bytes;
+ }
+
+ /**
+ * we allocate space from the end of the buffer for data.
+ * the keynodes grow from the front.
+ */
+ template< class V >
+ inline int BucketBasics<V>::_alloc(int bytes) {
+ assert( this->emptySize >= bytes );
+ this->topSize += bytes;
+ this->emptySize -= bytes;
+ int ofs = totalDataSize() - this->topSize;
+ assert( ofs > 0 );
+ return ofs;
+ }
+
+ template< class V >
+ void BucketBasics<V>::_delKeyAtPos(int keypos, bool mayEmpty) {
+ // TODO This should be keypos < n
+ assert( keypos >= 0 && keypos <= this->n );
+ assert( childForPos(keypos).isNull() );
+ // TODO audit cases where nextChild is null
+ assert( ( mayEmpty && this->n > 0 ) || this->n > 1 || this->nextChild.isNull() );
+ this->emptySize += sizeof(_KeyNode);
+ this->n--;
+ for ( int j = keypos; j < this->n; j++ )
+ k(j) = k(j+1);
+ setNotPacked();
+ }
+
+ /**
+ * pull rightmost key from the bucket. this version requires its right child to be null so it
+ * does not bother returning that value.
+ */
+ template< class V >
+ void BucketBasics<V>::popBack(DiskLoc& recLoc, Key &key) {
+ massert( 10282 , "n==0 in btree popBack()", this->n > 0 );
+ assert( k(this->n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that
+ KeyNode kn = keyNode(this->n-1);
+ recLoc = kn.recordLoc;
+ key.assign(kn.key);
+ int keysize = kn.key.dataSize();
+
+ massert( 10283 , "rchild not null in btree popBack()", this->nextChild.isNull());
+
+ // weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full.
+ this->nextChild = kn.prevChildBucket;
+
+ this->n--;
+ // This is risky because the key we are returning points to this unalloc'ed memory,
+ // and we are assuming that the last key points to the last allocated
+ // bson region.
+ this->emptySize += sizeof(_KeyNode);
+ _unalloc(keysize);
+ }
+
+ /** add a key. must be > all existing. be careful to set next ptr right. */
+ template< class V >
+ bool BucketBasics<V>::_pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) {
+ int bytesNeeded = key.dataSize() + sizeof(_KeyNode);
+ if ( bytesNeeded > this->emptySize )
+ return false;
+ assert( bytesNeeded <= this->emptySize );
+ if( this->n ) {
+ const KeyNode klast = keyNode(this->n-1);
+ if( klast.key.woCompare(key, order) > 0 ) {
+ log() << "btree bucket corrupt? consider reindexing or running validate command" << endl;
+ log() << " klast: " << keyNode(this->n-1).key.toString() << endl;
+ log() << " key: " << key.toString() << endl;
+ DEV klast.key.woCompare(key, order);
+ assert(false);
+ }
+ }
+ this->emptySize -= sizeof(_KeyNode);
+ _KeyNode& kn = k(this->n++);
+ kn.prevChildBucket = prevChild;
+ kn.recordLoc = recordLoc;
+ kn.setKeyDataOfs( (short) _alloc(key.dataSize()) );
+ short ofs = kn.keyDataOfs();
+ char *p = dataAt(ofs);
+ memcpy(p, key.data(), key.dataSize());
+
+ return true;
+ }
+
+ /* durability note
+ we do separate intent declarations herein. arguably one could just declare
+ the whole bucket given we do group commits. this is something we could investigate
+ later as to what is faster under what situations.
+ */
+ /** insert a key in a bucket with no complexity -- no splits required
+ @return false if a split is required.
+ */
+ template< class V >
+ bool BucketBasics<V>::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const {
+ check( this->n < 1024 );
+ check( keypos >= 0 && keypos <= this->n );
+ int bytesNeeded = key.dataSize() + sizeof(_KeyNode);
+ if ( bytesNeeded > this->emptySize ) {
+ _pack(thisLoc, order, keypos);
+ if ( bytesNeeded > this->emptySize )
+ return false;
+ }
+
+ BucketBasics *b;
+ {
+ const char *p = (const char *) &k(keypos);
+ const char *q = (const char *) &k(this->n+1);
+ // declare that we will write to [k(keypos),k(n)]
+ // todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so
+ // we can log a very small amount.
+ b = (BucketBasics*) getDur().writingAtOffset((void *) this, p-(char*)this, q-p);
+
+ // e.g. n==3, keypos==2
+ // 1 4 9
+ // ->
+ // 1 4 _ 9
+ for ( int j = this->n; j > keypos; j-- ) // make room
+ b->k(j) = b->k(j-1);
+ }
+
+ getDur().declareWriteIntent(&b->emptySize, sizeof(this->emptySize)+sizeof(this->topSize)+sizeof(this->n));
+ b->emptySize -= sizeof(_KeyNode);
+ b->n++;
+
+ // This _KeyNode was marked for writing above.
+ _KeyNode& kn = b->k(keypos);
+ kn.prevChildBucket.Null();
+ kn.recordLoc = recordLoc;
+ kn.setKeyDataOfs((short) b->_alloc(key.dataSize()) );
+ char *p = b->dataAt(kn.keyDataOfs());
+ getDur().declareWriteIntent(p, key.dataSize());
+ memcpy(p, key.data(), key.dataSize());
+ return true;
+ }
+
+ /**
+ * With this implementation, refPos == 0 disregards effect of refPos.
+ * index > 0 prevents creation of an empty bucket.
+ */
+ template< class V >
+ bool BucketBasics<V>::mayDropKey( int index, int refPos ) const {
+ return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull();
+ }
+
+ template< class V >
+ int BucketBasics<V>::packedDataSize( int refPos ) const {
+ if ( this->flags & Packed ) {
+ return V::BucketSize - this->emptySize - headerSize();
+ }
+ int size = 0;
+ for( int j = 0; j < this->n; ++j ) {
+ if ( mayDropKey( j, refPos ) ) {
+ continue;
+ }
+ size += keyNode( j ).key.dataSize() + sizeof( _KeyNode );
+ }
+ return size;
+ }
+
+ /**
+ * when we delete things we just leave empty space until the node is
+ * full and then we repack it.
+ */
+ template< class V >
+ void BucketBasics<V>::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const {
+ if ( this->flags & Packed )
+ return;
+
+ VERIFYTHISLOC
+
+ /** TODO perhaps this can be optimized. for example if packing does no write, we can skip intent decl.
+       an empirical approach is probably better than just adding new code: perhaps the bucket would need
+ declaration anyway within the group commit interval, in which case we would just be adding
+ code and complexity without benefit.
+ */
+ thisLoc.btreemod<V>()->_packReadyForMod(order, refPos);
+ }
+
+ /** version when write intent already declared */
+ template< class V >
+ void BucketBasics<V>::_packReadyForMod( const Ordering &order, int &refPos ) {
+ assertWritable();
+
+ if ( this->flags & Packed )
+ return;
+
+ int tdz = totalDataSize();
+ char temp[V::BucketSize];
+ int ofs = tdz;
+ this->topSize = 0;
+ int i = 0;
+ for ( int j = 0; j < this->n; j++ ) {
+ if( mayDropKey( j, refPos ) ) {
+ continue; // key is unused and has no children - drop it
+ }
+ if( i != j ) {
+ if ( refPos == j ) {
+ refPos = i; // i < j so j will never be refPos again
+ }
+ k( i ) = k( j );
+ }
+ short ofsold = k(i).keyDataOfs();
+ int sz = keyNode(i).key.dataSize();
+ ofs -= sz;
+ this->topSize += sz;
+ memcpy(temp+ofs, dataAt(ofsold), sz);
+ k(i).setKeyDataOfsSavingUse( ofs );
+ ++i;
+ }
+ if ( refPos == this->n ) {
+ refPos = i;
+ }
+ this->n = i;
+ int dataUsed = tdz - ofs;
+ memcpy(this->data + ofs, temp + ofs, dataUsed);
+
+ // assertWritable();
+ // TEMP TEST getDur().declareWriteIntent(this, sizeof(*this));
+
+ this->emptySize = tdz - dataUsed - this->n * sizeof(_KeyNode);
+ {
+ int foo = this->emptySize;
+ assert( foo >= 0 );
+ }
+
+ setPacked();
+
+ assertValid( order );
+ }
+
+ template< class V >
+ inline void BucketBasics<V>::truncateTo(int N, const Ordering &order, int &refPos) {
+ d.dbMutex.assertWriteLocked();
+ assertWritable();
+
+ this->n = N;
+ setNotPacked();
+ _packReadyForMod( order, refPos );
+ }
+
+ /**
+ * In the standard btree algorithm, we would split based on the
+ * existing keys _and_ the new key. But that's more work to
+ * implement, so we split the existing keys and then add the new key.
+ *
+ * There are several published heuristic algorithms for doing splits,
+ * but basically what you want are (1) even balancing between the two
+ * sides and (2) a small split key so the parent can have a larger
+ * branching factor.
+ *
+ * We just have a simple algorithm right now: if a key includes the
+ * halfway point (or 10% way point) in terms of bytes, split on that key;
+ * otherwise split on the key immediately to the left of the halfway
+ * point (or 10% point).
+ *
+ * This function is expected to be called on a packed bucket.
+ */
+ template< class V >
+ int BucketBasics<V>::splitPos( int keypos ) const {
+ assert( this->n > 2 );
+ int split = 0;
+ int rightSize = 0;
+ // when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split.
+ // see SERVER-983
+ // TODO I think we only want to do the 90% split on the rhs node of the tree.
+ int rightSizeLimit = ( this->topSize + sizeof( _KeyNode ) * this->n ) / ( keypos == this->n ? 10 : 2 );
+ for( int i = this->n - 1; i > -1; --i ) {
+ rightSize += keyNode( i ).key.dataSize() + sizeof( _KeyNode );
+ if ( rightSize > rightSizeLimit ) {
+ split = i;
+ break;
+ }
+ }
+ // safeguards - we must not create an empty bucket
+ if ( split < 1 ) {
+ split = 1;
+ }
+ else if ( split > this->n - 2 ) {
+ split = this->n - 2;
+ }
+
+ return split;
+ }
+
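+    // Worked example of the policy above (editorial sketch, not part of this
+    // commit): with n == 100 equally sized keys, a middle insert (keypos < n)
+    // sets rightSizeLimit to half the packed data, so split lands near key
+    // 50; an insert after all existing keys (keypos == n) sets it to one
+    // tenth, so split lands near key 90 and ~90% of the data stays on the
+    // left, keeping buckets densely packed under strictly increasing inserts.
+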
+ template< class V >
+ void BucketBasics<V>::reserveKeysFront( int nAdd ) {
+ assert( this->emptySize >= int( sizeof( _KeyNode ) * nAdd ) );
+ this->emptySize -= sizeof( _KeyNode ) * nAdd;
+ for( int i = this->n - 1; i > -1; --i ) {
+ k( i + nAdd ) = k( i );
+ }
+ this->n += nAdd;
+ }
+
+ template< class V >
+ void BucketBasics<V>::setKey( int i, const DiskLoc recordLoc, const Key &key, const DiskLoc prevChildBucket ) {
+ _KeyNode &kn = k( i );
+ kn.recordLoc = recordLoc;
+ kn.prevChildBucket = prevChildBucket;
+ short ofs = (short) _alloc( key.dataSize() );
+ kn.setKeyDataOfs( ofs );
+ char *p = dataAt( ofs );
+ memcpy( p, key.data(), key.dataSize() );
+ }
+
+ template< class V >
+ void BucketBasics<V>::dropFront( int nDrop, const Ordering &order, int &refpos ) {
+ for( int i = nDrop; i < this->n; ++i ) {
+ k( i - nDrop ) = k( i );
+ }
+ this->n -= nDrop;
+ setNotPacked();
+ _packReadyForMod( order, refpos );
+ }
+
+ /* - BtreeBucket --------------------------------------------------- */
+
+    /** find the largest key in the subtree rooted at thisLoc; the result is returned via largestLoc (bucket) and largestKey (key index). */
+ template< class V >
+ void BtreeBucket<V>::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) {
+ DiskLoc loc = thisLoc;
+ while ( 1 ) {
+ const BtreeBucket *b = loc.btree<V>();
+ if ( !b->nextChild.isNull() ) {
+ loc = b->nextChild;
+ continue;
+ }
+
+ assert(b->n>0);
+ largestLoc = loc;
+ largestKey = b->n-1;
+
+ break;
+ }
+ }
+
+ /**
+ * NOTE Currently the Ordering implementation assumes a compound index will
+ * not have more keys than an unsigned variable has bits. The same
+ * assumption is used in the implementation below with respect to the 'mask'
+ * variable.
+ *
+ * @param l a regular bsonobj
+ * @param rBegin composed partly of an existing bsonobj, and the remaining keys are taken from a vector of elements that frequently changes
+ *
+ * see
+ * jstests/index_check6.js
+ * https://jira.mongodb.org/browse/SERVER-371
+ */
+ /* static */
+ template< class V >
+ int BtreeBucket<V>::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) {
+ BSONObjIterator ll( l );
+ BSONObjIterator rr( rBegin );
+ vector< const BSONElement * >::const_iterator rr2 = rEnd.begin();
+ vector< bool >::const_iterator inc = rEndInclusive.begin();
+ unsigned mask = 1;
+ for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) {
+ BSONElement lll = ll.next();
+ BSONElement rrr = rr.next();
+ ++rr2;
+ ++inc;
+
+ int x = lll.woCompare( rrr, false );
+ if ( o.descending( mask ) )
+ x = -x;
+ if ( x != 0 )
+ return x;
+ }
+ if ( rSup ) {
+ return -direction;
+ }
+ for( ; ll.more(); mask <<= 1 ) {
+ BSONElement lll = ll.next();
+ BSONElement rrr = **rr2;
+ ++rr2;
+ int x = lll.woCompare( rrr, false );
+ if ( o.descending( mask ) )
+ x = -x;
+ if ( x != 0 )
+ return x;
+ if ( !*inc ) {
+ return -direction;
+ }
+ ++inc;
+ }
+ return 0;
+ }
+
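+    // Example (editorial sketch, not part of this commit): for index
+    // { a:1, b:1 } with l = { a:1, b:2 }, rBegin = { a:1 }, rBeginLen = 1 and
+    // rSup = false, the first loop consumes 'a' (equal) and advances past the
+    // first rEnd/rEndInclusive slots; the second loop then compares b:2 with
+    // the element pointed to by rEnd[1]. If they are equal and
+    // rEndInclusive[1] is true the result is 0 (l lies on the inclusive
+    // bound); if it is false the result is -direction, so for direction == 1
+    // l is treated as sorting before the exclusive bound.
+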
+ template< class V >
+ bool BtreeBucket<V>::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const {
+ int pos;
+ bool found;
+ DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
+
+ // skip unused keys
+ while ( 1 ) {
+ if( b.isNull() )
+ break;
+ const BtreeBucket *bucket = b.btree<V>();
+ const _KeyNode& kn = bucket->k(pos);
+ if ( kn.isUsed() )
+ return bucket->keyAt(pos).woEqual(key);
+ b = bucket->advance(b, pos, 1, "BtreeBucket<V>::exists");
+ }
+ return false;
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::wouldCreateDup(
+ const IndexDetails& idx, const DiskLoc &thisLoc,
+ const Key& key, const Ordering& order,
+ const DiskLoc &self) const {
+ int pos;
+ bool found;
+ DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
+
+ while ( !b.isNull() ) {
+ // we skip unused keys
+ const BtreeBucket *bucket = b.btree<V>();
+ const _KeyNode& kn = bucket->k(pos);
+ if ( kn.isUsed() ) {
+ if( bucket->keyAt(pos).woEqual(key) )
+ return kn.recordLoc != self;
+ break;
+ }
+ b = bucket->advance(b, pos, 1, "BtreeBucket<V>::dupCheck");
+ }
+
+ return false;
+ }
+
+ template< class V >
+ string BtreeBucket<V>::dupKeyError( const IndexDetails& idx , const Key& key ) {
+ stringstream ss;
+ ss << "E11000 duplicate key error ";
+ ss << "index: " << idx.indexNamespace() << " ";
+ ss << "dup key: " << key.toString();
+ return ss.str();
+ }
+
+ /**
+     * Find a key within this btree bucket.
+     *
+     * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the
+     * key. This ensures that even when there are many duplicates (e.g., 1 million) for a key,
+     * our performance is still good.
+ *
+ * assertIfDup: if the key exists (ignoring the recordLoc), uassert
+ *
+ * pos: for existing keys k0...kn-1.
+ * returns # it goes BEFORE. so key[pos-1] < key < key[pos]
+ * returns n if it goes after the last existing key.
+ * note result might be an Unused location!
+ */
+
+ bool guessIncreasing = false;
+ template< class V >
+ bool BtreeBucket<V>::find(const IndexDetails& idx, const Key& key, const DiskLoc &rl,
+ const Ordering &order, int& pos, bool assertIfDup) const {
+ Loc recordLoc;
+ recordLoc = rl;
+ globalIndexCounters.btree( (char*)this );
+
+ // binary search for this key
+ bool dupsChecked = false;
+ int l=0;
+ int h=this->n-1;
+ int m = (l+h)/2;
+ if( guessIncreasing ) {
+ m = h;
+ }
+ while ( l <= h ) {
+ KeyNode M = this->keyNode(m);
+ int x = key.woCompare(M.key, order);
+ if ( x == 0 ) {
+ if( assertIfDup ) {
+ if( k(m).isUnused() ) {
+                        // it is ok for the key to be present if it is unused, but then we must check
+                        // that there are no other (used) entries for the key. it is very rare that we
+                        // get here, so we make no effort to make this particularly fast
+ if( !dupsChecked ) {
+ dupsChecked = true;
+ if( idx.head.btree<V>()->exists(idx, idx.head, key, order) ) {
+ if( idx.head.btree<V>()->wouldCreateDup(idx, idx.head, key, order, recordLoc) )
+ uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+ else
+ alreadyInIndex();
+ }
+ }
+ }
+ else {
+ if( M.recordLoc == recordLoc )
+ alreadyInIndex();
+ uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+ }
+ }
+
+ // dup keys allowed. use recordLoc as if it is part of the key
+ Loc unusedRL = M.recordLoc;
+ unusedRL.GETOFS() &= ~1; // so we can test equality without the used bit messing us up
+ x = recordLoc.compare(unusedRL);
+ }
+ if ( x < 0 ) // key < M.key
+ h = m-1;
+ else if ( x > 0 )
+ l = m+1;
+ else {
+ // found it.
+ pos = m;
+ return true;
+ }
+ m = (l+h)/2;
+ }
+ // not found
+ pos = l;
+ if ( pos != this->n ) {
+ Key keyatpos = keyNode(pos).key;
+ wassert( key.woCompare(keyatpos, order) <= 0 );
+ if ( pos > 0 ) {
+ if( !( keyNode(pos-1).key.woCompare(key, order) <= 0 ) ) {
+ DEV {
+ log() << key.toString() << endl;
+ log() << keyNode(pos-1).key.toString() << endl;
+ }
+ wassert(false);
+ }
+ }
+ }
+
+ return false;
+ }
+
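+    // Example (editorial sketch, not part of this commit), with dups allowed:
+    // for bucket keys [ 10, 20, 20, 30 ], the two 20s ordered by recordLoc,
+    // find() for key 25 misses and returns false with pos == 3, since 25
+    // sorts before key[3] == 30; find() for key 20 with a recordLoc between
+    // the two existing entries returns false with pos == 2, the insertion
+    // point that keeps the (key, recordLoc) pairs sorted.
+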
+ template< class V >
+ void BtreeBucket<V>::delBucket(const DiskLoc thisLoc, const IndexDetails& id) {
+ ClientCursor::informAboutToDeleteBucket(thisLoc); // slow...
+ assert( !isHead() );
+
+ DiskLoc ll = this->parent;
+ const BtreeBucket *p = ll.btree<V>();
+ int parentIdx = indexInParent( thisLoc );
+ p->childForPos( parentIdx ).writing().Null();
+ deallocBucket( thisLoc, id );
+ }
+
+ template< class V >
+ void BtreeBucket<V>::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) {
+#if 0
+ // as a temporary defensive measure, we zap the whole bucket, AND don't truly delete
+ // it (meaning it is ineligible for reuse).
+ memset(this, 0, Size());
+#else
+ // defensive:
+ this->n = -1;
+ this->parent.Null();
+ string ns = id.indexNamespace();
+ theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), thisLoc.rec(), thisLoc);
+#endif
+ }
+
+    /** note: may delete the entire bucket!  'this' may be invalid upon return. */
+ template< class V >
+ void BtreeBucket<V>::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) {
+ assert(this->n>0);
+ DiskLoc left = this->childForPos(p);
+
+ if ( this->n == 1 ) {
+ if ( left.isNull() && this->nextChild.isNull() ) {
+ this->_delKeyAtPos(p);
+ if ( isHead() ) {
+ // we don't delete the top bucket ever
+ }
+ else {
+ if ( !mayBalanceWithNeighbors( thisLoc, id, order ) ) {
+ // An empty bucket is only allowed as a transient state. If
+ // there are no neighbors to balance with, we delete ourself.
+ // This condition is only expected in legacy btrees.
+ delBucket(thisLoc, id);
+ }
+ }
+ return;
+ }
+ deleteInternalKey( thisLoc, p, id, order );
+ return;
+ }
+
+ if ( left.isNull() ) {
+ this->_delKeyAtPos(p);
+ mayBalanceWithNeighbors( thisLoc, id, order );
+ }
+ else {
+ deleteInternalKey( thisLoc, p, id, order );
+ }
+ }
+
+ /**
+ * This function replaces the specified key (k) by either the prev or next
+ * key in the btree (k'). We require that k have either a left or right
+ * child. If k has a left child, we set k' to the prev key of k, which must
+ * be a leaf present in the left child. If k does not have a left child, we
+ * set k' to the next key of k, which must be a leaf present in the right
+ * child. When we replace k with k', we copy k' over k (which may cause a
+ * split) and then remove k' from its original location. Because k' is
+ * stored in a descendent of k, replacing k by k' will not modify the
+ * storage location of the original k', and we can easily remove k' from
+ * its original location.
+ *
+ * This function is only needed in cases where k has a left or right child;
+ * in other cases a simpler key removal implementation is possible.
+ *
+ * NOTE on legacy btree structures:
+ * In legacy btrees, k' can be a nonleaf. In such a case we 'delete' k by
+ * marking it as an unused node rather than replacing it with k'. Also, k'
+ * may be a leaf but marked as an unused node. In such a case we replace
+ * k by k', preserving the key's unused marking. This function is only
+ * expected to mark a key as unused when handling a legacy btree.
+ */
+ template< class V >
+ void BtreeBucket<V>::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) {
+ DiskLoc lchild = this->childForPos( keypos );
+ DiskLoc rchild = this->childForPos( keypos + 1 );
+ assert( !lchild.isNull() || !rchild.isNull() );
+ int advanceDirection = lchild.isNull() ? 1 : -1;
+ int advanceKeyOfs = keypos;
+ DiskLoc advanceLoc = advance( thisLoc, advanceKeyOfs, advanceDirection, __FUNCTION__ );
+        // advanceLoc must be a descendant of thisLoc, because thisLoc has a
+ // child in the proper direction and all descendants of thisLoc must be
+ // nonempty because they are not the root.
+
+ if ( !advanceLoc.btree<V>()->childForPos( advanceKeyOfs ).isNull() ||
+ !advanceLoc.btree<V>()->childForPos( advanceKeyOfs + 1 ).isNull() ) {
+ // only expected with legacy btrees, see note above
+ this->markUnused( keypos );
+ return;
+ }
+
+ KeyNode kn = advanceLoc.btree<V>()->keyNode( advanceKeyOfs );
+ // Because advanceLoc is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of advanceLoc and kn will be stable
+ // during the following setInternalKey()
+ setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, this->childForPos( keypos ), this->childForPos( keypos + 1 ), id );
+ advanceLoc.btreemod<V>()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order );
+ }
+
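+    // Example (editorial sketch, not part of this commit): to delete the
+    // internal key 40 from a bucket [ 20 | 40 | 60 ] whose child between 20
+    // and 40 is the leaf [ 30 35 ], 40 has a left child, so advance() with
+    // direction -1 locates its predecessor 35; setInternalKey() overwrites
+    // 40 with 35 in place (children preserved), then delKeyAtPos() removes
+    // 35 from the leaf, rebalancing there if it falls below the low water
+    // mark.
+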
+//#define BTREE(loc) (static_cast<DiskLoc>(loc).btree<V>())
+#define BTREE(loc) (loc.template btree<V>())
+//#define BTREEMOD(loc) (static_cast<DiskLoc>(loc).btreemod<V>())
+#define BTREEMOD(loc) (loc.template btreemod<V>())
+
+ template< class V >
+ void BtreeBucket<V>::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) {
+ assert( this->n == 0 && !this->nextChild.isNull() );
+ if ( this->parent.isNull() ) {
+ assert( id.head == thisLoc );
+ id.head.writing() = this->nextChild;
+ }
+ else {
+ DiskLoc ll = this->parent;
+ ll.btree<V>()->childForPos( indexInParent( thisLoc ) ).writing() = this->nextChild;
+ }
+ BTREE(this->nextChild)->parent.writing() = this->parent;
+ ClientCursor::informAboutToDeleteBucket( thisLoc );
+ deallocBucket( thisLoc, id );
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const {
+ assert( leftIndex >= 0 && leftIndex < this->n );
+ DiskLoc leftNodeLoc = this->childForPos( leftIndex );
+ DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 );
+ if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) {
+ // TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway
+ return false;
+ }
+ int pos = 0;
+ {
+ const BtreeBucket *l = leftNodeLoc.btree<V>();
+ const BtreeBucket *r = rightNodeLoc.btree<V>();
+ if ( ( this->headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.dataSize() + sizeof(_KeyNode) > unsigned( V::BucketSize ) ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * This implementation must respect the meaning and value of lowWaterMark.
+ * Also see comments in splitPos().
+ */
+ template< class V >
+ int BtreeBucket<V>::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const {
+ int split = -1;
+ int rightSize = 0;
+ const BtreeBucket *l = BTREE(this->childForPos( leftIndex ));
+ const BtreeBucket *r = BTREE(this->childForPos( leftIndex + 1 ));
+
+ int KNS = sizeof( _KeyNode );
+ int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.dataSize() + KNS + r->topSize + r->n * KNS ) / 2;
+ // This constraint should be ensured by only calling this function
+ // if we go below the low water mark.
+ assert( rightSizeLimit < BtreeBucket<V>::bodySize() );
+ for( int i = r->n - 1; i > -1; --i ) {
+ rightSize += r->keyNode( i ).key.dataSize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = l->n + 1 + i;
+ break;
+ }
+ }
+ if ( split == -1 ) {
+ rightSize += keyNode( leftIndex ).key.dataSize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = l->n;
+ }
+ }
+ if ( split == -1 ) {
+ for( int i = l->n - 1; i > -1; --i ) {
+ rightSize += l->keyNode( i ).key.dataSize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = i;
+ break;
+ }
+ }
+ }
+ // safeguards - we must not create an empty bucket
+ if ( split < 1 ) {
+ split = 1;
+ }
+ else if ( split > l->n + 1 + r->n - 2 ) {
+ split = l->n + 1 + r->n - 2;
+ }
+
+ return split;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
+ DiskLoc leftNodeLoc = this->childForPos( leftIndex );
+ DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 );
+ BtreeBucket *l = leftNodeLoc.btreemod<V>();
+ BtreeBucket *r = rightNodeLoc.btreemod<V>();
+ int pos = 0;
+ l->_packReadyForMod( order, pos );
+ r->_packReadyForMod( order, pos ); // pack r in case there are droppable keys
+
+ // We know the additional keys below will fit in l because canMergeChildren()
+ // must be true.
+ int oldLNum = l->n;
+ {
+ KeyNode kn = keyNode( leftIndex );
+ l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ for( int i = 0; i < r->n; ++i ) {
+ KeyNode kn = r->keyNode( i );
+ l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket );
+ }
+ l->nextChild = r->nextChild;
+ l->fixParentPtrs( leftNodeLoc, oldLNum );
+ r->delBucket( rightNodeLoc, id );
+ this->childForPos( leftIndex + 1 ) = leftNodeLoc;
+ this->childForPos( leftIndex ) = DiskLoc();
+ this->_delKeyAtPos( leftIndex, true );
+ if ( this->n == 0 ) {
+ // will trash this and thisLoc
+ // TODO To ensure all leaves are of equal height, we should ensure
+ // this is only called on the root.
+ replaceWithNextChild( thisLoc, id );
+ }
+ else {
+ // balance recursively - maybe we should do this even when n == 0?
+ mayBalanceWithNeighbors( thisLoc, id, order );
+ }
+ }
+
+ template< class V >
+ int BtreeBucket<V>::indexInParent( const DiskLoc &thisLoc ) const {
+ assert( !this->parent.isNull() );
+ const BtreeBucket *p = BTREE(this->parent);
+ if ( p->nextChild == thisLoc ) {
+ return p->n;
+ }
+ else {
+ for( int i = 0; i < p->n; ++i ) {
+ if ( p->k( i ).prevChildBucket == thisLoc ) {
+ return i;
+ }
+ }
+ }
+ out() << "ERROR: can't find ref to child bucket.\n";
+ out() << "child: " << thisLoc << "\n";
+ dump();
+ out() << "Parent: " << this->parent << "\n";
+ p->dump();
+ assert(false);
+ return -1; // just to compile
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const {
+ // If we can merge, then we must merge rather than balance to preserve
+ // bucket utilization constraints.
+ if ( canMergeChildren( thisLoc, leftIndex ) ) {
+ return false;
+ }
+ thisLoc.btreemod<V>()->doBalanceChildren( thisLoc, leftIndex, id, order );
+ return true;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order ) {
+ // TODO maybe do some audits the same way pushBack() does?
+ // As a precondition, rchild + the old separator are <= half a body size,
+ // and lchild is at most completely full. Based on the value of split,
+ // rchild will get <= half of the total bytes which is at most 75%
+ // of a full body. So rchild will have room for the following keys:
+ int rAdd = l->n - split;
+ r->reserveKeysFront( rAdd );
+ for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) {
+ KeyNode kn = l->keyNode( i );
+ r->setKey( j, kn.recordLoc, kn.key, kn.prevChildBucket );
+ }
+ {
+ KeyNode kn = keyNode( leftIndex );
+ r->setKey( rAdd - 1, kn.recordLoc, kn.key, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ r->fixParentPtrs( rchild, 0, rAdd - 1 );
+ {
+ KeyNode kn = l->keyNode( split );
+ l->nextChild = kn.prevChildBucket;
+ // Because lchild is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of lchild and kn will be stable
+ // during the following setInternalKey()
+ setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id );
+ }
+ int zeropos = 0;
+ // lchild and rchild cannot be merged, so there must be >0 (actually more)
+ // keys to the left of split.
+ l->truncateTo( split, order, zeropos );
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order ) {
+ // As a precondition, lchild + the old separator are <= half a body size,
+ // and rchild is at most completely full. Based on the value of split,
+ // lchild will get less than half of the total bytes which is at most 75%
+ // of a full body. So lchild will have room for the following keys:
+ int lN = l->n;
+ {
+ KeyNode kn = keyNode( leftIndex );
+ l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ for( int i = 0; i < split - lN - 1; ++i ) {
+ KeyNode kn = r->keyNode( i );
+ l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket );
+ }
+ {
+ KeyNode kn = r->keyNode( split - lN - 1 );
+ l->nextChild = kn.prevChildBucket;
+            // Child lN was lchild's old nextChild, so we don't need to fix that one.
+ l->fixParentPtrs( lchild, lN + 1, l->n );
+ // Because rchild is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of rchild and kn will be stable
+ // during the following setInternalKey()
+ setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id );
+ }
+ int zeropos = 0;
+ // lchild and rchild cannot be merged, so there must be >0 (actually more)
+ // keys to the right of split.
+ r->dropFront( split - lN, order, zeropos );
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
+ DiskLoc lchild = this->childForPos( leftIndex );
+ DiskLoc rchild = this->childForPos( leftIndex + 1 );
+ int zeropos = 0;
+ BtreeBucket *l = lchild.btreemod<V>();
+ l->_packReadyForMod( order, zeropos );
+ BtreeBucket *r = rchild.btreemod<V>();
+ r->_packReadyForMod( order, zeropos );
+ int split = rebalancedSeparatorPos( thisLoc, leftIndex );
+
+ // By definition, if we are below the low water mark and cannot merge
+ // then we must actively balance.
+ assert( split != l->n );
+ if ( split < l->n ) {
+ doBalanceLeftToRight( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order );
+ }
+ else {
+ doBalanceRightToLeft( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order );
+ }
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const {
+ if ( this->parent.isNull() ) { // we are root, there are no neighbors
+ return false;
+ }
+
+ if ( this->packedDataSize( 0 ) >= this->lowWaterMark() ) {
+ return false;
+ }
+
+ const BtreeBucket *p = BTREE(this->parent);
+ int parentIdx = indexInParent( thisLoc );
+
+ // TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so?
+ bool mayBalanceRight = ( ( parentIdx < p->n ) && !p->childForPos( parentIdx + 1 ).isNull() );
+ bool mayBalanceLeft = ( ( parentIdx > 0 ) && !p->childForPos( parentIdx - 1 ).isNull() );
+
+ // Balance if possible on one side - we merge only if absolutely necessary
+ // to preserve btree bucket utilization constraints since that's a more
+ // heavy duty operation (especially if we must re-split later).
+ if ( mayBalanceRight &&
+ p->tryBalanceChildren( this->parent, parentIdx, id, order ) ) {
+ return true;
+ }
+ if ( mayBalanceLeft &&
+ p->tryBalanceChildren( this->parent, parentIdx - 1, id, order ) ) {
+ return true;
+ }
+
+ BtreeBucket *pm = BTREEMOD(this->parent);
+ if ( mayBalanceRight ) {
+ pm->doMergeChildren( this->parent, parentIdx, id, order );
+ return true;
+ }
+ else if ( mayBalanceLeft ) {
+ pm->doMergeChildren( this->parent, parentIdx - 1, id, order );
+ return true;
+ }
+
+ return false;
+ }
+
+ /** remove a key from the index */
+ template< class V >
+ bool BtreeBucket<V>::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const {
+ int pos;
+ bool found;
+ const Ordering ord = Ordering::make(id.keyPattern());
+ DiskLoc loc = locate(id, thisLoc, key, ord, pos, found, recordLoc, 1);
+ if ( found ) {
+ if ( key.objsize() > this->KeyMax ) {
+ OCCASIONALLY problem() << "unindex: key too large to index but was found for " << id.indexNamespace() << " reIndex suggested" << endl;
+ }
+ loc.btreemod<V>()->delKeyAtPos(loc, id, pos, ord);
+ return true;
+ }
+ return false;
+ }
+
+ template< class V >
+ inline void BtreeBucket<V>::fix(const DiskLoc thisLoc, const DiskLoc child) {
+ if ( !child.isNull() ) {
+ if ( insert_debug )
+ out() << " fix " << child.toString() << ".parent=" << thisLoc.toString() << endl;
+ child.btree<V>()->parent.writing() = thisLoc;
+ }
+ }
+
+ /**
+ * This can cause a lot of additional page writes when we assign buckets to
+ * different parents. Maybe get rid of parent ptrs?
+ */
+ template< class V >
+ void BtreeBucket<V>::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const {
+ VERIFYTHISLOC
+ if ( lastIndex == -1 ) {
+ lastIndex = this->n;
+ }
+ for ( int i = firstIndex; i <= lastIndex; i++ ) {
+ fix(thisLoc, this->childForPos(i));
+ }
+ }
+
+ template< class V >
+ void BtreeBucket<V>::setInternalKey( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key &key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx ) {
+ this->childForPos( keypos ).Null();
+
+ // This may leave the bucket empty (n == 0) which is ok only as a
+ // transient state. In the instant case, the implementation of
+ // insertHere behaves correctly when n == 0 and as a side effect
+ // increments n.
+ this->_delKeyAtPos( keypos, true );
+
+ // Ensure we do not orphan neighbor's old child.
+ assert( this->childForPos( keypos ) == rchild );
+
+ // Just set temporarily - required to pass validation in insertHere()
+ this->childForPos( keypos ) = lchild;
+
+ insertHere( thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx );
+ }
+
+ /**
+ * insert a key in this bucket, splitting if necessary.
+ * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost.
+ * NOTE this function may free some data, and as a result the value passed for keypos may
+ * be invalid after calling insertHere()
+ *
+ * Some of the write intent signaling below relies on the implementation of
+ * the optimized write intent code in basicInsert().
+ */
+ template< class V >
+ void BtreeBucket<V>::insertHere( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key& key, const Ordering& order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) const {
+ if ( insert_debug )
+ out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' '
+ << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl;
+
+ if ( !this->basicInsert(thisLoc, keypos, recordLoc, key, order) ) {
+ // If basicInsert() fails, the bucket will be packed as required by split().
+ thisLoc.btreemod<V>()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx);
+ return;
+ }
+
+ {
+ const _KeyNode *_kn = &k(keypos);
+ _KeyNode *kn = (_KeyNode *) getDur().alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert()
+ if ( keypos+1 == this->n ) { // last key
+ if ( this->nextChild != lchild ) {
+ out() << "ERROR nextChild != lchild" << endl;
+ out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
+ out() << " keyPos: " << keypos << " n:" << this->n << endl;
+ out() << " nextChild: " << this->nextChild.toString() << " lchild: " << lchild.toString() << endl;
+ out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
+ out() << " key: " << key.toString() << endl;
+ dump();
+ assert(false);
+ }
+ kn->prevChildBucket = this->nextChild;
+ assert( kn->prevChildBucket == lchild );
+ this->nextChild.writing() = rchild;
+ if ( !rchild.isNull() )
+ BTREE(rchild)->parent.writing() = thisLoc;
+ }
+ else {
+ kn->prevChildBucket = lchild;
+ if ( k(keypos+1).prevChildBucket != lchild ) {
+ out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl;
+ out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
+ out() << " keyPos: " << keypos << " n:" << this->n << endl;
+ out() << " k(keypos+1).pcb: " << k(keypos+1).prevChildBucket.toString() << " lchild: " << lchild.toString() << endl;
+ out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
+ out() << " key: " << key.toString() << endl;
+ dump();
+ assert(false);
+ }
+ const Loc *pc = &k(keypos+1).prevChildBucket;
+ *getDur().alreadyDeclared( const_cast<Loc*>(pc) ) = rchild; // declared in basicInsert()
+ if ( !rchild.isNull() )
+ rchild.btree<V>()->parent.writing() = thisLoc;
+ }
+ return;
+ }
+ }
+
+ template< class V >
+ void BtreeBucket<V>::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const Key& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) {
+ this->assertWritable();
+
+ if ( split_debug )
+ out() << " " << thisLoc.toString() << ".split" << endl;
+
+ int split = this->splitPos( keypos );
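+ // Resulting layout (sketch): this bucket keeps keys [0, split), the key at
+ // index 'split' is promoted to the parent, and the new right bucket gets
+ // keys (split, n).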
+ DiskLoc rLoc = addBucket(idx);
+ BtreeBucket *r = rLoc.btreemod<V>();
+ if ( split_debug )
+ out() << " split:" << split << ' ' << keyNode(split).key.toString() << " n:" << this->n << endl;
+ for ( int i = split+1; i < this->n; i++ ) {
+ KeyNode kn = keyNode(i);
+ r->pushBack(kn.recordLoc, kn.key, order, kn.prevChildBucket);
+ }
+ r->nextChild = this->nextChild;
+ r->assertValid( order );
+
+ if ( split_debug )
+ out() << " new rLoc:" << rLoc.toString() << endl;
+ r = 0;
+ rLoc.btree<V>()->fixParentPtrs(rLoc);
+
+ {
+ KeyNode splitkey = keyNode(split);
+ this->nextChild = splitkey.prevChildBucket; // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r)
+ if ( split_debug ) {
+ out() << " splitkey key:" << splitkey.key.toString() << endl;
+ }
+
+ // Because thisLoc is a descendant of parent, updating parent will
+ // not affect packing or keys of thisLoc and splitkey will be stable
+ // during the following:
+
+ // promote splitkey to a parent node
+ if ( this->parent.isNull() ) {
+ // make a new parent if we were the root
+ DiskLoc L = addBucket(idx);
+ BtreeBucket *p = L.btreemod<V>();
+ p->pushBack(splitkey.recordLoc, splitkey.key, order, thisLoc);
+ p->nextChild = rLoc;
+ p->assertValid( order );
+ this->parent = idx.head.writing() = L;
+ if ( split_debug )
+ out() << " we were root, making new root:" << hex << this->parent.getOfs() << dec << endl;
+ rLoc.btree<V>()->parent.writing() = this->parent;
+ }
+ else {
+ // set this before calling _insert - if it splits it will do fixParent() logic and change the value.
+ rLoc.btree<V>()->parent.writing() = this->parent;
+ if ( split_debug )
+ out() << " promoting splitkey key " << splitkey.key.toString() << endl;
+ BTREE(this->parent)->_insert(this->parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx);
+ }
+ }
+
+ int newpos = keypos;
+ // note this may trash splitkey.key. thus we had to promote it before finishing up here.
+ this->truncateTo(split, order, newpos);
+
+ // add our new key, there is room now
+ {
+ if ( keypos <= split ) {
+ if ( split_debug )
+ out() << " keypos<split, insertHere() the new key" << endl;
+ insertHere(thisLoc, newpos, recordLoc, key, order, lchild, rchild, idx);
+ }
+ else {
+ int kp = keypos-split-1;
+ assert(kp>=0);
+ BTREE(rLoc)->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx);
+ }
+ }
+
+ if ( split_debug )
+ out() << " split end " << hex << thisLoc.getOfs() << dec << endl;
+ }
+
+ /** start a new index off, empty */
+ template< class V >
+ DiskLoc BtreeBucket<V>::addBucket(const IndexDetails& id) {
+ string ns = id.indexNamespace();
+ DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, V::BucketSize, true);
+ BtreeBucket *b = BTREEMOD(loc);
+ b->init();
+ return loc;
+ }
+
+ void renameIndexNamespace(const char *oldNs, const char *newNs) {
+ renameNamespace( oldNs, newNs );
+ }
+
+ template< class V >
+ const DiskLoc BtreeBucket<V>::getHead(const DiskLoc& thisLoc) const {
+ DiskLoc p = thisLoc;
+ while ( !BTREE(p)->isHead() )
+ p = BTREE(p)->parent;
+ return p;
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const {
+ if ( keyOfs < 0 || keyOfs >= this->n ) {
+ out() << "ASSERT failure BtreeBucket<V>::advance, caller: " << caller << endl;
+ out() << " thisLoc: " << thisLoc.toString() << endl;
+ out() << " keyOfs: " << keyOfs << " n:" << this->n << " direction: " << direction << endl;
+ out() << bucketSummary() << endl;
+ assert(false);
+ }
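+ // In-order advance, in brief: if the child subtree between the current key
+ // and the next key in the direction of travel exists, descend to its
+ // extreme key (leftmost for direction > 0, rightmost otherwise); else move
+ // within this bucket; else walk back up until an ancestor is found whose
+ // key is adjacent to this child in the direction of travel.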
+ int adj = direction < 0 ? 1 : 0;
+ int ko = keyOfs + direction;
+ DiskLoc nextDown = this->childForPos(ko+adj);
+ if ( !nextDown.isNull() ) {
+ while ( 1 ) {
+ keyOfs = direction>0 ? 0 : BTREE(nextDown)->n - 1;
+ DiskLoc loc = BTREE(nextDown)->childForPos(keyOfs + adj);
+ if ( loc.isNull() )
+ break;
+ nextDown = loc;
+ }
+ return nextDown;
+ }
+
+ if ( ko < this->n && ko >= 0 ) {
+ keyOfs = ko;
+ return thisLoc;
+ }
+
+ // end of bucket. traverse back up.
+ DiskLoc childLoc = thisLoc;
+ DiskLoc ancestor = this->parent;
+ while ( 1 ) {
+ if ( ancestor.isNull() )
+ break;
+ const BtreeBucket *an = BTREE(ancestor);
+ for ( int i = 0; i < an->n; i++ ) {
+ if ( an->childForPos(i+adj) == childLoc ) {
+ keyOfs = i;
+ return ancestor;
+ }
+ }
+ assert( direction<0 || an->nextChild == childLoc );
+ // parent exhausted also, keep going up
+ childLoc = ancestor;
+ ancestor = an->parent;
+ }
+
+ return DiskLoc();
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const {
+ KeyOwned k(key);
+ return locate(idx, thisLoc, k, order, pos, found, recordLoc, direction);
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const Key& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const {
+ int p;
+ found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false);
+ if ( found ) {
+ pos = p;
+ return thisLoc;
+ }
+
+ DiskLoc child = this->childForPos(p);
+
+ if ( !child.isNull() ) {
+ DiskLoc l = BTREE(child)->locate(idx, child, key, order, pos, found, recordLoc, direction);
+ if ( !l.isNull() )
+ return l;
+ }
+
+ pos = p;
+ if ( direction < 0 )
+ return --pos == -1 ? DiskLoc() /*theend*/ : thisLoc;
+ else
+ return pos == this->n ? DiskLoc() /*theend*/ : thisLoc;
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) {
+ const BtreeBucket<V> * bucket = BTREE(thisLoc);
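+ // Binary search over (l, h]: on entry key(l) is on the near side of the
+ // target and key(h) on the far side (direction-adjusted); the loop narrows
+ // until l + 1 == h, then descends into the child between those two keys,
+ // recording the position in bestParent in case we must back out.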
+ while( 1 ) {
+ if ( l + 1 == h ) {
+ keyOfs = ( direction > 0 ) ? h : l;
+ DiskLoc next = bucket->k( h ).prevChildBucket;
+ if ( !next.isNull() ) {
+ bestParent = make_pair( thisLoc, keyOfs );
+ thisLoc = next;
+ return true;
+ }
+ else {
+ return false;
+ }
+ }
+ int m = l + ( h - l ) / 2;
+ int cmp = customBSONCmp( bucket->keyNode( m ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
+ if ( cmp < 0 ) {
+ l = m;
+ }
+ else if ( cmp > 0 ) {
+ h = m;
+ }
+ else {
+ if ( direction < 0 ) {
+ l = m;
+ }
+ else {
+ h = m;
+ }
+ }
+ }
+ }
+
+ /**
+ * Find the smallest/biggest key greater-equal/less-equal to the one specified.
+ * On entry, thisLoc + keyOfs must be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd.
+ * All the direction checks below allowed me to refactor the code, but separate forward and reverse implementations might possibly be more efficient.
+ */
+ template< class V >
+ void BtreeBucket<V>::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const {
+ int l,h;
+ bool dontGoUp;
+ if ( direction > 0 ) {
+ l = keyOfs;
+ h = this->n - 1;
+ dontGoUp = ( customBSONCmp( keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 );
+ }
+ else {
+ l = 0;
+ h = keyOfs;
+ dontGoUp = ( customBSONCmp( keyNode( l ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 );
+ }
+ pair< DiskLoc, int > bestParent;
+ if ( dontGoUp ) {
+ // this comparison result assures h > l
+ if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) {
+ return;
+ }
+ }
+ else {
+ // go up parents until rightmost/leftmost node is >=/<= target or at top
+ while( !BTREE(thisLoc)->parent.isNull() ) {
+ thisLoc = BTREE(thisLoc)->parent;
+ if ( direction > 0 ) {
+ if ( customBSONCmp( BTREE(thisLoc)->keyNode( BTREE(thisLoc)->n - 1 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) {
+ break;
+ }
+ }
+ else {
+ if ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) {
+ break;
+ }
+ }
+ }
+ }
+ customLocate( thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, bestParent );
+ }
+
+ /** @param thisLoc in/out param. perhaps thisLoc isn't the best name given that.
+ It is used by advanceTo(), which skips
+ from one key to another key without necessarily checking all the keys
+ between them in the btree (it can skip to different btree buckets).
+ advanceTo() can get called a lot, and for the different targets
+ we want to advance to, we don't want to create a bson obj in a new
+ buffer each time we call that function.
+ customLocate() exists to support advanceTo(): it does the same thing
+ as the normal locate() function, but takes basically the same arguments
+ as advanceTo().
+ */
+ template< class V >
+ void BtreeBucket<V>::customLocate(DiskLoc &locInOut, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey,
+ const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive,
+ const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) {
+ dassert( direction == 1 || direction == -1 );
+ const BtreeBucket<V> *bucket = BTREE(locInOut);
+ if ( bucket->n == 0 ) {
+ locInOut = DiskLoc();
+ return;
+ }
+ // go down until find smallest/biggest >=/<= target
+ while( 1 ) {
+ int l = 0;
+ int h = bucket->n - 1;
+
+ // +direction: 0, -direction: h
+ int z = (1-direction)/2*h;
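+ // e.g. direction == 1 -> z == 0 (leftmost key);
+ // direction == -1 -> z == h (rightmost key)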
+
+ // leftmost/rightmost key may possibly be >=/<= search key
+ int res = customBSONCmp( bucket->keyNode( z ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
+ bool firstCheck = direction*res >= 0;
+
+ if ( firstCheck ) {
+ DiskLoc next;
+ keyOfs = z;
+ if ( direction > 0 ) {
+ dassert( z == 0 );
+ next = bucket->k( 0 ).prevChildBucket;
+ }
+ else {
+ next = bucket->nextChild;
+ }
+ if ( !next.isNull() ) {
+ bestParent = pair< DiskLoc, int >( locInOut, keyOfs );
+ locInOut = next;
+ bucket = BTREE(locInOut);
+ continue;
+ }
+ else {
+ return;
+ }
+ }
+
+ res = customBSONCmp( bucket->keyNode( h-z ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
+ bool secondCheck = direction*res < 0;
+
+ if ( secondCheck ) {
+ DiskLoc next;
+ if ( direction > 0 ) {
+ next = bucket->nextChild;
+ }
+ else {
+ next = bucket->k( 0 ).prevChildBucket;
+ }
+ if ( next.isNull() ) {
+ // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc()
+ locInOut = bestParent.first;
+ keyOfs = bestParent.second;
+ return;
+ }
+ else {
+ locInOut = next;
+ bucket = BTREE(locInOut);
+ continue;
+ }
+ }
+
+ if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, locInOut, keyOfs, bestParent ) ) {
+ return;
+ }
+ bucket = BTREE(locInOut);
+ }
+ }
+
+ /** @thisLoc disk location of *this */
+ template< class V >
+ void BtreeBucket<V>::insertStepOne(DiskLoc thisLoc,
+ Continuation<V>& c,
+ bool dupsAllowed) const {
+ dassert( c.key.dataSize() <= this->KeyMax );
+ assert( c.key.dataSize() > 0 );
+
+ int pos;
+ bool found = find(c.idx, c.key, c.recordLoc, c.order, pos, !dupsAllowed);
+
+ if ( found ) {
+ const _KeyNode& kn = k(pos);
+ if ( kn.isUnused() ) {
+ log(4) << "btree _insert: reusing unused key" << endl;
+ c.b = this;
+ c.pos = pos;
+ c.op = Continuation<V>::SetUsed;
+ return;
+ }
+
+ DEV {
+ log() << "_insert(): key already exists in index (ok for background:true)\n";
+ log() << " " << c.idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n';
+ log() << " " << c.key.toString() << '\n';
+ log() << " " << "recordLoc:" << c.recordLoc.toString() << " pos:" << pos << endl;
+ log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl;
+ }
+ alreadyInIndex();
+ }
+
+ Loc ch = this->childForPos(pos);
+ DiskLoc child = ch;
+
+ if ( child.isNull() ) {
+ // A new key will be inserted at the same tree height as an adjacent existing key.
+ c.bLoc = thisLoc;
+ c.b = this;
+ c.pos = pos;
+ c.op = Continuation<V>::InsertHere;
+ return;
+ }
+
+ child.btree<V>()->insertStepOne(child, c, dupsAllowed);
+ }
+
+ /** @thisLoc disk location of *this */
+ template< class V >
+ int BtreeBucket<V>::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const Key& key, const Ordering &order, bool dupsAllowed,
+ const DiskLoc lChild, const DiskLoc rChild, IndexDetails& idx) const {
+ if ( key.dataSize() > this->KeyMax ) {
+ problem() << "ERROR: key too large len:" << key.dataSize() << " max:" << this->KeyMax << ' ' << key.dataSize() << ' ' << idx.indexNamespace() << endl;
+ return 2;
+ }
+ assert( key.dataSize() > 0 );
+
+ int pos;
+ bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed);
+ if ( insert_debug ) {
+ out() << " " << thisLoc.toString() << '.' << "_insert " <<
+ key.toString() << '/' << recordLoc.toString() <<
+ " l:" << lChild.toString() << " r:" << rChild.toString() << endl;
+ out() << " found:" << found << " pos:" << pos << " n:" << this->n << endl;
+ }
+
+ if ( found ) {
+ const _KeyNode& kn = k(pos);
+ if ( kn.isUnused() ) {
+ log(4) << "btree _insert: reusing unused key" << endl;
+ massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull());
+ massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull());
+ kn.writing().setUsed();
+ return 0;
+ }
+
+ DEV {
+ log() << "_insert(): key already exists in index (ok for background:true)\n";
+ log() << " " << idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n';
+ log() << " " << key.toString() << '\n';
+ log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl;
+ log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl;
+ log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl;
+ }
+ alreadyInIndex();
+ }
+
+ DEBUGGING out() << "TEMP: key: " << key.toString() << endl;
+ Loc ch = this->childForPos(pos);
+ DiskLoc child = ch;
+ if ( insert_debug )
+ out() << " getChild(" << pos << "): " << child.toString() << endl;
+ // In current usage, rChild isNull() for a new key and false when we are
+ // promoting a split key. These are the only two cases where _insert()
+ // is called currently.
+ if ( child.isNull() || !rChild.isNull() ) {
+ // A new key will be inserted at the same tree height as an adjacent existing key.
+ insertHere(thisLoc, pos, recordLoc, key, order, lChild, rChild, idx);
+ return 0;
+ }
+
+ return child.btree<V>()->_insert(child, recordLoc, key, order, dupsAllowed, /*lchild*/DiskLoc(), /*rchild*/DiskLoc(), idx);
+ }
+
+ template< class V >
+ void BtreeBucket<V>::dump(unsigned depth) const {
+ string indent = string(depth, ' ');
+ _log() << "BUCKET n:" << this->n;
+ _log() << " parent:" << hex << this->parent.getOfs() << dec;
+ for ( int i = 0; i < this->n; i++ ) {
+ _log() << '\n' << indent;
+ KeyNode k = keyNode(i);
+ string ks = k.key.toString();
+ _log() << " " << hex << k.prevChildBucket.getOfs() << '\n';
+ _log() << indent << " " << i << ' ' << ks.substr(0, 30) << " Loc:" << k.recordLoc.toString() << dec;
+ if ( this->k(i).isUnused() )
+ _log() << " UNUSED";
+ }
+ _log() << "\n" << indent << " " << hex << this->nextChild.getOfs() << dec << endl;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::twoStepInsert(DiskLoc thisLoc, Continuation<V> &c, bool dupsAllowed) const
+ {
+
+ if ( c.key.dataSize() > this->KeyMax ) {
+ problem() << "ERROR: key too large len:" << c.key.dataSize() << " max:" << this->KeyMax << ' ' << c.key.dataSize() << ' ' << c.idx.indexNamespace() << endl;
+ return; // op=Nothing
+ }
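+ // Step one only records what to do in the Continuation (SetUsed or
+ // InsertHere) so that it can run under a weaker, upgradable lock - see the
+ // twoStepInsert() comment in btree.h; the recorded op is applied afterwards.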
+ insertStepOne(thisLoc, c, dupsAllowed);
+ }
+
+ /** TODO: meaning of return code unclear; clean up */
+ template< class V >
+ int BtreeBucket<V>::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& _key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel) const
+ {
+ guessIncreasing = _key.firstElementType() == jstOID && idx.isIdIndex();
+ KeyOwned key(_key);
+
+ dassert(toplevel);
+ if ( toplevel ) {
+ if ( key.dataSize() > this->KeyMax ) {
+ problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.dataSize() << ' ' << key.toString() << endl;
+ return 3;
+ }
+ }
+
+ int x;
+ try {
+ x = _insert(thisLoc, recordLoc, key, order, dupsAllowed, DiskLoc(), DiskLoc(), idx);
+ this->assertValid( order );
+ }
+ catch( ... ) {
+ guessIncreasing = false;
+ throw;
+ }
+ guessIncreasing = false;
+ return x;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::shape(stringstream& ss) const {
+ this->_shape(0, ss);
+ }
+
+ template< class V >
+ int BtreeBucket<V>::getKeyMax() {
+ return V::KeyMax;
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const {
+ int pos;
+ bool found;
+ // TODO: is it really ok here that the order is a default?
+ // for findById() use, yes. for checkNoIndexConflicts, no?
+ Ordering o = Ordering::make(BSONObj());
+ DiskLoc bucket = locate( indexdetails , indexdetails.head , key , o , pos , found , minDiskLoc );
+ if ( bucket.isNull() )
+ return bucket;
+
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ while ( 1 ) {
+ const _KeyNode& knraw = b->k(pos);
+ if ( knraw.isUsed() )
+ break;
+ bucket = b->advance( bucket , pos , 1 , "findSingle" );
+ if ( bucket.isNull() )
+ return bucket;
+ b = bucket.btree<V>();
+ }
+ KeyNode kn = b->keyNode( pos );
+ if ( KeyOwned(key).woCompare( kn.key, o ) != 0 )
+ return DiskLoc();
+ return kn.recordLoc;
+ }
+
+} // namespace mongo
+
+#include "db.h"
+#include "dbhelpers.h"
+
+namespace mongo {
+
+ template< class V >
+ void BtreeBucket<V>::a_test(IndexDetails& id) {
+ BtreeBucket *b = id.head.btreemod<V>();
+
+ // record locs for testing
+ DiskLoc A(1, 20);
+ DiskLoc B(1, 30);
+ DiskLoc C(1, 40);
+
+ DiskLoc rl;
+ BSONObj key = fromjson("{x:9}");
+ BSONObj orderObj = fromjson("{}");
+ Ordering order = Ordering::make(orderObj);
+
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ assert( b->k(0).isUsed() );
+// b->k(0).setUnused();
+ b->k(1).setUnused();
+ b->k(2).setUnused();
+ b->k(3).setUnused();
+
+ b->dumpTree(id.head, orderObj);
+
+ /* b->bt_insert(id.head, B, key, order, false, id);
+ b->k(1).setUnused();
+ b->dumpTree(id.head, order);
+ b->bt_insert(id.head, A, key, order, false, id);
+ b->dumpTree(id.head, order);
+ */
+
+ // this should assert. does it? (it might assert "accidentally" though; not asserting proves a problem, while asserting proves nothing)
+ b->bt_insert(id.head, C, key, order, false, id);
+
+ // b->dumpTree(id.head, order);
+ }
+
+ template class BucketBasics<V0>;
+ template class BucketBasics<V1>;
+ template class BtreeBucket<V0>;
+ template class BtreeBucket<V1>;
+ template struct __KeyNode<DiskLoc>;
+ template struct __KeyNode<DiskLoc56Bit>;
+
+ struct BTUnitTest : public UnitTest {
+ void run() {
+ DiskLoc big(0xf12312, 0x70001234);
+ DiskLoc56Bit bigl;
+ {
+ bigl = big;
+ assert( big == bigl );
+ DiskLoc e = bigl;
+ assert( big == e );
+ }
+ {
+ DiskLoc d;
+ assert( d.isNull() );
+ DiskLoc56Bit l;
+ l = d;
+ assert( l.isNull() );
+ d = l;
+ assert( d.isNull() );
+ assert( l < bigl );
+ }
+ }
+ } btunittest;
+
+}
diff --git a/src/mongo/db/btree.h b/src/mongo/db/btree.h
new file mode 100644
index 00000000000..85e5172d163
--- /dev/null
+++ b/src/mongo/db/btree.h
@@ -0,0 +1,1174 @@
+// btree.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+#include "diskloc.h"
+#include "pdfile.h"
+#include "key.h"
+
+namespace mongo {
+
+ /**
+ * Our btree implementation generally follows the standard btree algorithm,
+ * which is described in many places. The nodes of our btree are referred to
+ * as buckets below. These buckets are of size BucketSize and their body is
+ * an ordered array of <bson key, disk loc> pairs, where disk loc is the disk
+ * location of a document and bson key is a projection of this document into
+ * the schema of the index for this btree. Ordering is determined on the
+ * basis of bson key first and then disk loc in case of a tie. All bson keys
+ * for a btree have identical schemas with empty string field names and may
+ * not have an objsize() exceeding KeyMax. The btree's buckets are
+ * themselves organized into an ordered tree. Although there are exceptions,
+ * generally buckets with n keys have n+1 children and the body of a bucket is
+ * at least lowWaterMark bytes. A more strictly enforced requirement is that
+ * a non root bucket must have at least one key except in certain transient
+ * states.
+ *
+ * Our btrees support the following primary read operations: finding a
+ * specified key; iterating from a starting key to the next or previous
+ * ordered key; and skipping from a starting key to another specified key
+ * without checking every intermediate key. The primary write operations
+ * are insertion and deletion of keys. Insertion may trigger a bucket split
+ * if necessary to avoid bucket overflow. In such a case, subsequent splits
+ * will occur recursively as necessary. Deletion may trigger a bucket
+ * rebalance, in which a size deficient bucket is filled with keys from an
+ * adjacent bucket. In this case, splitting may potentially occur in the
+ * parent. Deletion may alternatively trigger a merge, in which the keys
+ * from two buckets and a key from their shared parent are combined into the
+ * same bucket. In such a case, rebalancing or merging may proceed
+ * recursively from the parent.
+ *
+ * While the btree data format has been relatively constant over time, btrees
+ * initially created by versions of mongo earlier than the current version
+ * may embody different properties than freshly created btrees (while
+ * following the same data format). These older btrees are referred to
+ * below as legacy btrees.
+ */
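+ // For example, in an index on { x : 1 }, two documents with x == 9 both
+ // produce the bson key { "" : 9 } (note the empty field name), and their
+ // entries are then ordered by disk loc.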
+
+ const int OldBucketSize = 8192;
+
+#pragma pack(1)
+ template< class Version > class BucketBasics;
+
+ /**
+ * This is the fixed width data component for storage of a key within a
+ * bucket. It contains an offset pointer to the variable width bson
+ * data component. A _KeyNode may be 'unused', please see below.
+ */
+ template< class Loc >
+ struct __KeyNode {
+ /** Signals that we are writing this _KeyNode and casts away const */
+ __KeyNode<Loc> & writing() const;
+ /**
+ * The 'left' child bucket of this key. If this is the i-th key, it
+ * points to the i index child bucket.
+ */
+ Loc prevChildBucket;
+ /** The location of the record associated with this key. */
+ Loc recordLoc;
+ short keyDataOfs() const { return (short) _kdo; }
+
+ /** Offset within current bucket of the variable width bson key for this _KeyNode. */
+ unsigned short _kdo;
+ void setKeyDataOfs(short s) {
+ _kdo = s;
+ assert(s>=0);
+ }
+ /** Seems to be redundant. */
+ void setKeyDataOfsSavingUse(short s) {
+ _kdo = s;
+ assert(s>=0);
+ }
+ /**
+ * Unused keys are not returned by read operations. Keys may be marked
+ * as unused in cases where it is difficult to delete them while
+ * maintaining the constraints required of a btree.
+ *
+ * Setting ofs to odd is the sentinel for unused, as real recordLoc's
+ * are always even numbers. Note we need to keep its value basically
+ * the same as we use the recordLoc as part of the key in the index
+ * (to handle duplicate keys efficiently).
+ *
+ * Flagging keys as unused is a feature that is being phased out in favor
+ * of deleting the keys outright. The current btree implementation is
+ * not expected to mark a key as unused in a non legacy btree.
+ */
+ void setUnused() {
+ recordLoc.GETOFS() |= 1;
+ }
+ void setUsed() { recordLoc.GETOFS() &= ~1; }
+ int isUnused() const {
+ return recordLoc.getOfs() & 1;
+ }
+ int isUsed() const {
+ return !isUnused();
+ }
+ };
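+ // Sketch of the sentinel described above: real record offsets are always
+ // even, so the low bit of recordLoc's ofs can flag the key as unused, e.g.:
+ //
+ // DiskLoc loc(0, 0x1000); // a used key: ofs 0x1000 (even)
+ // loc.GETOFS() |= 1; // setUnused(): ofs becomes 0x1001 (odd)
+ // bool unused = loc.getOfs() & 1; // isUnused() reads the same bit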
+
+ /**
+ * This structure represents header data for a btree bucket. An object of
+ * this type is typically allocated inside of a buffer of size BucketSize,
+ * resulting in a full bucket with an appropriate header.
+ *
+ * The body of a btree bucket contains an array of _KeyNode objects starting
+ * from its lowest indexed bytes and growing to higher indexed bytes. The
+ * body also contains variable width bson keys, which are allocated from the
+ * highest indexed bytes toward lower indexed bytes.
+ *
+ * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb|
+ * h = header data
+ * k = KeyNode data
+ * - = empty space
+ * b = bson key data
+ * u = unused (old) bson key data, that may be garbage collected
+ */
+ class BtreeData_V0 {
+ protected:
+ /** Parent bucket of this bucket, which isNull() for the root bucket. */
+ DiskLoc parent;
+ /** Given that there are n keys, this is the n index child. */
+ DiskLoc nextChild;
+ /** can be reused, value is 8192 in current pdfile version Apr2010 */
+ unsigned short _wasSize;
+ /** zero */
+ unsigned short _reserved1;
+ int flags;
+
+ void _init() {
+ _reserved1 = 0;
+ _wasSize = BucketSize;
+ reserved = 0;
+ }
+
+ /** basicInsert() assumes the next three members are consecutive and in this order: */
+
+ /** Size of the empty region. */
+ int emptySize;
+ /** Size used for bson storage, including storage of old keys. */
+ int topSize;
+ /* Number of keys in the bucket. */
+ int n;
+
+ int reserved;
+ /* Beginning of the bucket's body */
+ char data[4];
+
+ public:
+ typedef __KeyNode<DiskLoc> _KeyNode;
+ typedef DiskLoc Loc;
+ typedef KeyBson Key;
+ typedef KeyBson KeyOwned;
+ enum { BucketSize = 8192 };
+
+ // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
+ static const int KeyMax = OldBucketSize / 10;
+ };
+
+ // a a a ofs ofs ofs ofs
+ class DiskLoc56Bit {
+ int ofs;
+ unsigned char _a[3];
+ unsigned long long Z() const {
+ // endian
+ return *((unsigned long long*)this) & 0x00ffffffffffffffULL;
+ }
+ enum {
+ // the low bit of offsets is used by _KeyNode (as the unused flag), so we don't use -1 here.
+ OurNullOfs = -2
+ };
+ public:
+ template< class V >
+ const BtreeBucket<V> * btree() const {
+ return DiskLoc(*this).btree<V>();
+ }
+ template< class V >
+ BtreeBucket<V> * btreemod() const {
+ return DiskLoc(*this).btreemod<V>();
+ }
+ operator const DiskLoc() const {
+ // endian
+ if( isNull() ) return DiskLoc();
+ unsigned a = *((unsigned *) (_a-1));
+ return DiskLoc(a >> 8, ofs);
+ }
+ int& GETOFS() { return ofs; }
+ int getOfs() const { return ofs; }
+ bool operator<(const DiskLoc56Bit& rhs) const {
+ // the ordering of dup keys in btrees isn't too critical, but we'd like to put items that are
+ // close together on disk close together in the tree, so we do want the file # to be the most significant
+ // bytes
+ return Z() < rhs.Z();
+ }
+ int compare(const DiskLoc56Bit& rhs) const {
+ unsigned long long a = Z();
+ unsigned long long b = rhs.Z();
+ if( a < b ) return -1;
+ return a == b ? 0 : 1;
+ }
+ bool operator==(const DiskLoc56Bit& rhs) const { return Z() == rhs.Z(); }
+ bool operator!=(const DiskLoc56Bit& rhs) const { return Z() != rhs.Z(); }
+ bool operator==(const DiskLoc& rhs) const {
+ return DiskLoc(*this) == rhs;
+ }
+ bool operator!=(const DiskLoc& rhs) const { return !(*this==rhs); }
+ bool isNull() const { return ofs < 0; }
+ void Null() {
+ ofs = OurNullOfs;
+ _a[0] = _a[1] = _a[2] = 0;
+ }
+ string toString() const { return DiskLoc(*this).toString(); }
+ void operator=(const DiskLoc& loc) {
+ ofs = loc.getOfs();
+ int la = loc.a();
+ assert( la <= 0xffffff ); // must fit in 3 bytes
+ if( la < 0 ) {
+ assert( la == -1 );
+ la = 0;
+ ofs = OurNullOfs;
+ }
+ memcpy(_a, &la, 3); // endian
+ dassert( ofs != 0 );
+ }
+ DiskLoc56Bit& writing() const {
+ return *((DiskLoc56Bit*) getDur().writingPtr((void*)this, 7));
+ }
+ };
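+ // Usage sketch, mirroring BTUnitTest in btree.cpp: a DiskLoc round-trips
+ // through the packed 56 bit form as long as the file # fits in 3 bytes:
+ //
+ // DiskLoc big(0xf12312, 0x70001234);
+ // DiskLoc56Bit packed;
+ // packed = big; // operator=(const DiskLoc&)
+ // assert( big == packed ); // equality via conversion back to DiskLoc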
+
+ class BtreeData_V1 {
+ public:
+ typedef DiskLoc56Bit Loc;
+ //typedef DiskLoc Loc;
+ typedef __KeyNode<Loc> _KeyNode;
+ typedef KeyV1 Key;
+ typedef KeyV1Owned KeyOwned;
+ enum { BucketSize = 8192-16 }; // leave room for Record header
+ // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
+ static const int KeyMax = 1024;
+ protected:
+ /** Parent bucket of this bucket, which isNull() for the root bucket. */
+ Loc parent;
+ /** Given that there are n keys, this is the n index child. */
+ Loc nextChild;
+
+ unsigned short flags;
+
+ /** basicInsert() assumes the next three members are consecutive and in this order: */
+
+ /** Size of the empty region. */
+ unsigned short emptySize;
+ /** Size used for bson storage, including storage of old keys. */
+ unsigned short topSize;
+ /* Number of keys in the bucket. */
+ unsigned short n;
+
+ /* Beginning of the bucket's body */
+ char data[4];
+
+ void _init() { }
+ };
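+ // Side by side, the two on-disk formats differ as follows:
+ // V0: Loc = DiskLoc (8 bytes), int-sized counters, BucketSize = 8192
+ // V1: Loc = DiskLoc56Bit (7 bytes), unsigned short counters,
+ // BucketSize = 8192-16 (leaving room for the Record header), KeyMax = 1024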
+
+ typedef BtreeData_V0 V0;
+ typedef BtreeData_V1 V1;
+
+ /**
+ * This class adds functionality to BtreeData for managing a single bucket.
+ * The following policies are used in an attempt to encourage simplicity:
+ *
+ * Const member functions of this class are those which may be called on
+ * an object for which writing has not been signaled. Non const member
+ * functions may only be called on objects for which writing has been
+ * signaled. Note that currently some const functions write to the
+ * underlying memory representation of this bucket using optimized methods
+ * to signal write operations.
+ *
+ * DiskLoc parameters that may shadow references within the btree should
+ * be passed by value rather than by reference to non const member
+ * functions or to const member functions which may perform writes. This way
+ * a callee need not worry that write operations will change or invalidate
+ * its arguments.
+ *
+ * The current policy for dealing with bson arguments is the opposite of
+ * what is described above for DiskLoc arguments. We do not want to copy
+ * bson into memory as an intermediate step for btree changes, and if bson
+ * is to be moved it must be copied to the new location before the old
+ * location is invalidated. Care should be taken in cases where that invalid
+ * memory may be implicitly referenced by function arguments.
+ *
+ * A number of functions below require a thisLoc argument, which must be the
+ * disk location of the bucket mapped to 'this'.
+ */
+ template< class Version >
+ class BucketBasics : public Version {
+ public:
+ template <class U> friend class BtreeBuilder;
+ typedef typename Version::Key Key;
+ typedef typename Version::_KeyNode _KeyNode;
+ typedef typename Version::Loc Loc;
+
+ int getN() const { return this->n; }
+
+ /**
+ * This is an in memory wrapper for a _KeyNode, and not itself part of btree
+ * storage. This object and its BSONObj 'key' will become invalid if the
+ * _KeyNode data that generated it is moved within the btree. In general,
+ * a KeyNode should not be expected to be valid after a write.
+ */
+ class KeyNode {
+ public:
+ KeyNode(const BucketBasics<Version>& bb, const _KeyNode &k);
+ const Loc& prevChildBucket;
+ const Loc& recordLoc;
+ /* Points to the bson key storage for a _KeyNode */
+ Key key;
+ };
+ friend class KeyNode;
+
+ /** Assert write intent declared for this bucket already. */
+ void assertWritable();
+
+ void assertValid(const Ordering &order, bool force = false) const;
+ void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); }
+
+ /**
+ * @return KeyNode for key at index i. The KeyNode will become invalid
+ * if the key is moved or reassigned, or if the node is packed. In general
+ * a KeyNode should not be expected to be valid after a write.
+ */
+ const KeyNode keyNode(int i) const {
+ if ( i >= this->n ) {
+ massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << this->n ).jsonString() , i < this->n );
+ }
+ return KeyNode(*this, k(i));
+ }
+
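+ // headerSize() below measures the offset of 'data' within the bucket via a
+ // null object pointer - the classic offsetof idiom (written out by hand,
+ // presumably because the members live in the templated Version base).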
+ static int headerSize() {
+ const BucketBasics *d = 0;
+ return (char*)&(d->data) - (char*)&(d->parent);
+ }
+ static int bodySize() { return Version::BucketSize - headerSize(); }
+ static int lowWaterMark() { return bodySize() / 2 - Version::KeyMax - sizeof( _KeyNode ) + 1; } // see comment in btree.cpp
+
+ // for testing
+ int nKeys() const { return this->n; }
+ const DiskLoc getNextChild() const { return this->nextChild; }
+
+ protected:
+ char * dataAt(short ofs) { return this->data + ofs; }
+
+ /** Initialize the header for a new node. */
+ void init();
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos <= n
+ * - If key is inserted at position keypos, the bucket's keys will still be
+ * in order.
+ * Postconditions:
+ * - If key can fit in the bucket, the bucket may be packed and keypos
+ * may be decreased to reflect deletion of earlier indexed keys during
+ * packing, the key will be inserted at the updated keypos index with
+ * a null prevChildBucket, the subsequent keys shifted to the right,
+ * and the function will return true.
+ * - If key cannot fit in the bucket, the bucket will be packed and
+ * the function will return false.
+ * Although this function is marked const, it modifies the underlying
+ * btree representation through an optimized write intent mechanism.
+ */
+ bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const;
+
+ /**
+ * Preconditions:
+ * - key / recordLoc are > all existing keys
+ * - The keys in prevChild and their descendents are between all existing
+ * keys and 'key'.
+ * Postconditions:
+ * - If there is space for key without packing, it is inserted as the
+ * last key with specified prevChild and true is returned.
+ * Importantly, nextChild is not updated!
+ * - Otherwise false is returned and there is no change.
+ */
+ bool _pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild);
+ void pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) {
+ bool ok = _pushBack( recordLoc , key , order , prevChild );
+ assert(ok);
+ }
+
+ /**
+ * This is a special purpose function used by BtreeBuilder. The
+ * interface is quite dangerous if you're not careful. The bson key
+ * returned here points to bucket memory that has been invalidated but
+ * not yet reclaimed.
+ *
+ * TODO Maybe this could be replaced with two functions, one which
+ * returns the last key without deleting it and another which simply
+ * deletes the last key. Then the caller would have enough control to
+ * ensure proper memory integrity.
+ *
+ * Preconditions:
+ * - bucket is not empty
+ * - last key of bucket is used (not unused)
+ * - nextChild isNull()
+ * - _unalloc will work correctly as used - see code
+ * Postconditions:
+ * - The last key of the bucket is removed, and its key and recLoc are
+ * returned. As mentioned above, the key points to unallocated memory.
+ */
+ void popBack(DiskLoc& recLoc, Key &key);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - there is no child bucket at keypos
+ * - n > 1
+ * - if mayEmpty == false or nextChild.isNull(), n > 0
+ * Postconditions:
+ * - The key at keypos is removed, and remaining keys are shifted over.
+ * - The bucket becomes unpacked.
+ * - if mayEmpty is true and nextChild.isNull(), the bucket may have no keys.
+ */
+ void _delKeyAtPos(int keypos, bool mayEmpty = false);
+
+ /* !Packed means there is deleted fragment space within the bucket.
+ We "repack" when we run out of space before considering the node
+ to be full.
+ */
+ enum Flags { Packed=1 };
+
+ /** n == 0 is ok */
+ const Loc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
+ Loc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
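+ // i.e. a bucket's n keys interleave with its n + 1 children:
+ // child(0) key(0) child(1) key(1) ... key(n-1) child(n)
+ // where child(i) is k(i).prevChildBucket for i < n, and child(n) is nextChild.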
+
+ /** Same as bodySize(). */
+ int totalDataSize() const;
+ /**
+ * @return true when a key may be dropped by pack()
+ * @param index index of the key that may be dropped
+ * @param refPos index of a particular key of interest, which must not
+ * be dropped; = 0 to safely ignore
+ */
+ bool mayDropKey( int index, int refPos ) const;
+
+ /**
+ * Pack the bucket to reclaim space from invalidated memory.
+ * @refPos is an index in the bucket which may be updated if we
+ * delete keys from the bucket
+ * This function may cast away const and perform a write.
+ * Preconditions: none
+ * Postconditions:
+ * - Bucket will be packed
+ * - Some unused nodes may be dropped, but not ones at index 0 or refPos
+ * - Some used nodes may be moved
+ * - If refPos is the index of an existing key, it will be updated to that
+ * key's new index if the key is moved.
+ */
+ void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const;
+ /** Pack when already writable */
+ void _packReadyForMod(const Ordering &order, int &refPos);
+
+ /** @return the size the bucket's body would have if we were to call pack() */
+ int packedDataSize( int refPos ) const;
+ void setNotPacked() { this->flags &= ~Packed; }
+ void setPacked() { this->flags |= Packed; }
+ /**
+ * Preconditions: 'bytes' is <= emptySize
+ * Postconditions: A buffer of size 'bytes' is allocated on the top side,
+ * and its offset is returned.
+ */
+ int _alloc(int bytes);
+ /**
+ * This function can be used to deallocate the lowest byte index bson
+ * buffer in the top region, which in some but not all cases is for the
+ * n - 1 index key. This function only works correctly in certain
+ * special cases, please be careful.
+ * Preconditions: 'bytes' <= topSize
+ * Postconditions: The top region is decreased
+ */
+ void _unalloc(int bytes);
+ /**
+ * Preconditions: 'N' <= n
+ * Postconditions:
+ * - All keys at index N and above are dropped.
+ * - The bucket is then packed, without dropping the refPos key if refPos < N.
+ */
+ void truncateTo(int N, const Ordering &order, int &refPos);
+ /**
+ * Preconditions:
+ * - 'nDrop' < n
+ * - for now, refPos should be zero.
+ * Postconditions:
+ * - All keys before the nDrop index key are dropped.
+ * - The bucket is packed.
+ */
+ void dropFront(int nDrop, const Ordering &order, int &refPos);
+ /**
+ * Preconditions: 0 <= keypos < n
+ * Postconditions: keypos indexed key is marked unused.
+ */
+ void markUnused(int keypos);
+
+ /**
+ * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
+ * We use tempNext() when we do that, to be less confusing. (One might have written a union in C.)
+ */
+ DiskLoc tempNext() const { return this->parent; }
+ void setTempNext(DiskLoc l) { this->parent = l; }
+
+ void _shape(int level, stringstream&) const;
+ int Size() const;
+
+ /** @return i-indexed _KeyNode, without bounds checking */
+ public:
+ const _KeyNode& k(int i) const { return ((const _KeyNode*)this->data)[i]; }
+ _KeyNode& _k(int i) { return ((_KeyNode*)this->data)[i]; }
+ protected:
+ _KeyNode& k(int i) { return ((_KeyNode*)this->data)[i]; }
+
+ /**
+ * Preconditions: 'this' is packed
+ * @return the key index to be promoted on split
+ * @param keypos The requested index of a key to insert, which may affect
+ * the choice of split position.
+ */
+ int splitPos( int keypos ) const;
+
+ /**
+ * Preconditions: nAdd * sizeof( _KeyNode ) <= emptySize
+ * Postconditions:
+ * - Increases indexes of existing _KeyNode objects by nAdd, reserving
+ * space for additional _KeyNode objects at front.
+ * - Does not initialize ofs values for the bson data of these
+ * _KeyNode objects.
+ */
+ void reserveKeysFront( int nAdd );
+
+ /**
+ * Preconditions:
+ * - 0 <= i < n
+ * - The bson 'key' must fit in the bucket without packing.
+ * - If 'key' and 'prevChildBucket' are set at index i, the btree
+ * ordering properties will be maintained.
+ * Postconditions:
+ * - The specified key is set at index i, replacing the existing
+ * _KeyNode data and without shifting any other _KeyNode objects.
+ */
+ void setKey( int i, const DiskLoc recordLoc, const Key& key, const DiskLoc prevChildBucket );
+ };
+
+ template< class V>
+ struct Continuation;
+
+ /**
+ * This class adds functionality for manipulating buckets that are assembled
+ * in a tree. The requirements for const and non const functions and
+ * arguments are generally the same as in BtreeBucket. Because this class
+ * deals with tree structure, some functions that are marked const may
+ * trigger modification of another node in the btree or potentially of the
+ * current node. In such cases, the function's implementation explicitly
+ * casts away const when indicating an intent to write to the durability
+ * layer. The DiskLocs provided to such functions should be passed by
+ * value if they shadow pointers within the btree.
+ *
+ * To clarify enforcement of referential integrity in this implementation,
+ * we use the following pattern when deleting data we have a persistent
+ * pointer to. The pointer is cleared or removed explicitly, then the data
+ * it pointed to is cleaned up with a helper function.
+ *
+ * TODO It might make sense to put some of these functions in a class
+ * representing a full btree instead of a single btree bucket. That would
+ * allow us to use the const qualifier in a manner more consistent with
+ * standard usage. Right now the interface is for both a node and a tree,
+ * so assignment of const is sometimes nonideal.
+ *
+ * TODO There are several cases in which the 'this' pointer is invalidated
+ * as a result of deallocation. A separate class representing a btree would
+ * alleviate some fragile cases where the implementation must currently
+ * behave correctly if the 'this' pointer is suddenly invalidated by a
+ * callee.
+ */
+ template< class V >
+ class BtreeBucket : public BucketBasics<V> {
+ friend class BtreeCursor;
+ friend struct Continuation<V>;
+ public:
+ // make compiler happy:
+ typedef typename V::Key Key;
+ typedef typename V::KeyOwned KeyOwned;
+ typedef typename BucketBasics<V>::KeyNode KeyNode;
+ typedef typename BucketBasics<V>::_KeyNode _KeyNode;
+ typedef typename BucketBasics<V>::Loc Loc;
+ const _KeyNode& k(int i) const { return static_cast< const BucketBasics<V> * >(this)->k(i); }
+ protected:
+ _KeyNode& k(int i) { return static_cast< BucketBasics<V> * >(this)->_k(i); }
+ public:
+ const KeyNode keyNode(int i) const { return static_cast< const BucketBasics<V> * >(this)->keyNode(i); }
+
+ bool isHead() const { return this->parent.isNull(); }
+ void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const;
+ long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount = 0, bool strict = false, unsigned depth=0) const; /* traverses everything */
+
+ bool isUsed( int i ) const { return this->k(i).isUsed(); }
+ string bucketSummary() const;
+ void dump(unsigned depth=0) const;
+
+ /**
+ * @return true if key exists in index
+ *
+ * @order - indicates order of keys in the index. this is basically the index's key pattern, e.g.:
+ * BSONObj order = ((IndexDetails&)idx).keyPattern();
+ * likewise below in bt_insert() etc.
+ */
+ private:
+ bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const;
+ public:
+
+ /**
+ * @param self - don't complain about a duplicate when it is 'self', i.e. when ourself is already in the index.
+ * @return true = There is a duplicate used key.
+ */
+ bool wouldCreateDup(
+ const IndexDetails& idx, const DiskLoc &thisLoc,
+ const Key& key, const Ordering& order,
+ const DiskLoc &self) const;
+
+ /**
+ * Preconditions: none
+ * Postconditions: @return a new bucket allocated from pdfile storage
+ * and init()-ed. This bucket is suitable to for use as a new root
+ * or any other new node in the tree.
+ */
+ static DiskLoc addBucket(const IndexDetails&);
+
+ /**
+ * Preconditions: none
+ * Postconditions:
+ * - Some header values in this bucket are cleared, and the bucket is
+ * deallocated from pdfile storage.
+ * - The memory at thisLoc is invalidated, and 'this' is invalidated.
+ */
+ void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);
+
+ /**
+ * Preconditions:
+ * - 'key' has a valid schema for this index.
+ * - All other parameters are valid and consistent with this index if applicable.
+ * Postconditions:
+ * - If key is bigger than KeyMax, @return 2 or 3 and no change.
+ * - If key / recordLoc exist in the btree as an unused key, set them
+ * as used and @return 0
+ * - If key / recordLoc exist in the btree as a used key, @throw
+ * exception 10287 and no change.
+ * - If key / recordLoc do not exist in the btree, they are inserted
+ * and @return 0. The root of the btree may be changed, so
+ * 'this'/thisLoc may no longer be the root upon return.
+ */
+ int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const;
+
+ /** Does the insert in two steps - an upgradable lock can then be used for step 1, which
+ is the part that may page fault. That step is also most of the computational work.
+ */
+ void twoStepInsert(DiskLoc thisLoc, Continuation<V> &c, bool dupsAllowed) const;
+
+ /**
+ * Preconditions:
+ * - 'key' has a valid schema for this index, and may have objsize() > KeyMax.
+ * Postconditions:
+ * - If key / recordLoc are in the btree, they are removed (possibly
+ * by being marked as an unused key), @return true, and potentially
+ * invalidate 'this' / thisLoc and change the head.
+ * - If key / recordLoc are not in the btree, @return false and do nothing.
+ */
+ bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const;
+
+ /**
+ * locate may return an "unused" key that is just a marker. so be careful.
+ * looks for a key:recordloc pair.
+ *
+ * @found - returns true if exact match found. note you can get back a position
+ * result even if found is false.
+ */
+ DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
+ DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const Key& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
+
+ /**
+ * find the first instance of the key
+ * does not handle dups
+ * WARNING: findSingle may not be compound index safe. this may need to change. see notes in
+ * findSingle code.
+ * @return the record location of the first match
+ */
+ DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const;
+
+ /**
+ * Advance to next or previous key in the index.
+ * @param direction to advance.
+ */
+ DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const;
+
+ /** Advance in specified direction to the specified key */
+ void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const;
+
+ /** Locate a key with fields comprised of a combination of keyBegin fields and keyEnd fields. */
+ static void customLocate(DiskLoc &locInOut, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) ;
+
+ /** @return head of the btree by traversing from current bucket. */
+ const DiskLoc getHead(const DiskLoc& thisLoc) const;
+
+ /** get tree shape */
+ void shape(stringstream&) const;
+
+ static void a_test(IndexDetails&);
+
+ static int getKeyMax();
+
+ protected:
+ /**
+ * Preconditions:
+ * - 0 <= firstIndex <= n
+ * - -1 <= lastIndex <= n ( -1 is equivalent to n )
+ * Postconditions:
+ * - Any children at indexes firstIndex through lastIndex (inclusive)
+ * will have their parent pointers set to thisLoc.
+ */
+ void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const;
+
+ /**
+ * Preconditions:
+ * - thisLoc is not the btree head.
+ * - n == 0 is ok
+ * Postconditions:
+ * - All cursors pointing to this bucket will be updated.
+ * - This bucket's parent's child pointer is set to null.
+ * - This bucket is deallocated from pdfile storage.
+ * - 'this' and thisLoc are invalidated.
+ */
+ void delBucket(const DiskLoc thisLoc, const IndexDetails&);
+
+ /**
+ * Preconditions: 0 <= p < n
+ * Postconditions:
+ * - The key at index p is removed from the btree.
+ * - 'this' and thisLoc may be invalidated.
+ * - The tree head may change.
+ */
+ void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order);
+
+ /**
+ * Preconditions:
+ * - n == 0 is ok
+ * Postconditions:
+ * - If thisLoc is head, or if its body has at least lowWaterMark bytes,
+ * return false and do nothing.
+ * - Otherwise, if thisLoc has left or right neighbors, either balance
+ * or merge with them and return true. Also, 'this' and thisLoc may
+ * be invalidated and the tree head may change.
+ */
+ bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const;
+
+ /**
+ * Preconditions:
+ * - 0 <= leftIndex < n
+ * - The child at leftIndex or the child at leftIndex + 1 contains
+ * fewer than lowWaterMark bytes.
+ * Postconditions:
+ * - If the child bucket at leftIndex can merge with the child index
+ * at leftIndex + 1, do nothing and return false.
+ * - Otherwise, balance keys between the leftIndex child and the
+ * leftIndex + 1 child, return true, and possibly change the tree head.
+ */
+ bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const;
+
+ /**
+ * Preconditions:
+ * - All preconditions of tryBalanceChildren.
+ * - The leftIndex child and leftIndex + 1 child cannot be merged.
+ * Postconditions:
+ * - Keys are moved between the leftIndex child and the leftIndex + 1
+ * child such that neither child has fewer than lowWaterMark bytes.
+ * The tree head may change.
+ */
+ void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order );
+
+ /**
+ * Preconditions:
+ * - All preconditions of doBalanceChildren
+ * - The leftIndex and leftIndex + 1 children are packed.
+ * - The leftIndex + 1 child has fewer than lowWaterMark bytes.
+ * - split returned by rebalancedSeparatorPos()
+ * Postconditions:
+ * - The key in lchild at index split is set as thisLoc's key at index
+ * leftIndex, which may trigger a split and change the tree head.
+ * The previous key in thisLoc at index leftIndex and all keys with
+ * indexes greater than split in lchild are moved to rchild.
+ */
+ void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket<V> *l, const DiskLoc lchild,
+ BtreeBucket<V> *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order );
+ /**
+ * Preconditions:
+ * - All preconditions of doBalanceChildren
+ * - The leftIndex and leftIndex + 1 children are packed.
+ * - The leftIndex child has fewer than lowWaterMark bytes.
+ * - split returned by rebalancedSeparatorPos()
+ * Postconditions:
+ * - The key in rchild at index split - l->n - 1 is set as thisLoc's key
+ * at index leftIndex, which may trigger a split and change the tree
+ * head. The previous key in thisLoc at index leftIndex and all keys
+ * with indexes less than split - l->n - 1 in rchild are moved to
+ * lchild.
+ */
+ void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket<V> *l, const DiskLoc lchild,
+ BtreeBucket<V> *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order );
+
+ /**
+ * Preconditions:
+ * - 0 <= leftIndex < n
+ * - this->canMergeChildren( thisLoc, leftIndex ) == true
+ * Postconditions:
+ * - All of the above mentioned keys will be placed in the left child.
+ * - The tree may be updated recursively, resulting in 'this' and
+ * thisLoc being invalidated and the tree head being changed.
+ */
+ void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order);
+
+ /**
+ * Preconditions:
+ * - n == 0
+ * - !nextChild.isNull()
+ * Postconditions:
+ * - 'this' and thisLoc are deallocated (and invalidated), any cursors
+ * to them are updated, and the tree head may change.
+ * - nextChild replaces thisLoc in the btree structure.
+ */
+ void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id );
+
+ /**
+ * @return true iff the leftIndex and leftIndex + 1 children both exist,
+ * and if their body sizes when packed and the thisLoc key at leftIndex
+ * would fit in a single bucket body.
+ */
+ bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const;
+
+ /**
+ * Preconditions:
+ * - leftIndex and leftIndex + 1 children are packed
+ * - leftIndex or leftIndex + 1 child is below lowWaterMark
+ * @return index of the rebalanced separator; the index value is
+ * determined as if we had a bucket with body
+ * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> )
+ * and called splitPos( 0 ) on it.
+ */
+ int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const;
+
+ /**
+ * Preconditions: thisLoc has a parent
+ * @return parent's index of thisLoc.
+ */
+ int indexInParent( const DiskLoc &thisLoc ) const;
+
+ public:
+ Key keyAt(int i) const {
+ if( i >= this->n )
+ return Key();
+ return Key(this->data + k(i).keyDataOfs());
+ }
+ protected:
+
+ /**
+ * Preconditions:
+ * - This bucket is packed.
+ * - Cannot add a key of size KeyMax to this bucket.
+ * - 0 <= keypos <= n is the position of a new key that will be inserted
+ * - lchild is equal to the existing child at index keypos.
+ * Postconditions:
+ * - The thisLoc bucket is split into two packed buckets, possibly
+ * invalidating the initial position of keypos, with a split key
+ * promoted to the parent. The new key key/recordLoc will be inserted
+ * into one of the split buckets, and lchild/rchild set appropriately.
+ * Splitting may occur recursively, possibly changing the tree head.
+ */
+ void split(const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key& key,
+ const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos <= n
+ * - If key / recordLoc are inserted at position keypos, with provided
+ * lchild and rchild, the btree ordering requirements will be
+ * maintained.
+ * - lchild is equal to the existing child at index keypos.
+ * - n == 0 is ok.
+ * Postconditions:
+ * - The key / recordLoc are inserted at position keypos, and the
+ * bucket is split if necessary, which may change the tree head.
+ * - The bucket may be packed or split, invalidating the specified value
+ * of keypos.
+ * This function will always modify thisLoc, but it's marked const because
+ * it commonly relies on the specialized write intent mechanism of basicInsert().
+ */
+ void insertHere(const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key& key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const;
+
+ /** bt_insert() is basically just a wrapper around this. */
+ int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const Key& key, const Ordering &order, bool dupsAllowed,
+ const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const;
+
+ void insertStepOne(DiskLoc thisLoc, Continuation<V>& c, bool dupsAllowed) const;
+
+ bool find(const IndexDetails& idx, const Key& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
+ static bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) ;
+ static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
+ static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction );
+
+ /** If child is non null, set its parent to thisLoc */
+ static void fix(const DiskLoc thisLoc, const DiskLoc child);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - If the specified key and recordLoc are placed in keypos of thisLoc,
+ * and lchild and rchild are set, the btree ordering properties will
+ * be maintained.
+ * - rchild == childForPos( keypos + 1 )
+ * - childForPos( keypos ) is referenced elsewhere if nonnull.
+ * Postconditions:
+ * - The key at keypos will be replaced with the specified key and
+ * lchild, potentially splitting this bucket and changing the tree
+ * head.
+ * - childForPos( keypos ) will be orphaned.
+ */
+ void setInternalKey( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key &key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - The keypos or keypos+1 indexed child is non null.
+ * Postconditions:
+ * - The specified key is deleted by replacing it with another key if
+ * possible. This replacement may cause a split and change the tree
+ * head. The replacement key will be deleted from its original
+ * location, potentially causing merges and splits that may invalidate
+ * 'this' and thisLoc and change the tree head.
+ * - If the key cannot be replaced, it will be marked as unused. This
+ * is only expected in legacy btrees.
+ */
+ void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order );
+ public:
+ /** simply builds and returns a dup key error message string */
+ static string dupKeyError( const IndexDetails& idx , const Key& key );
+ };
+#pragma pack()
+
+ class FieldRangeVector;
+ class FieldRangeVectorIterator;
+
+ class BtreeCursor : public Cursor {
+ protected:
+ BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+ BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+ public:
+ virtual ~BtreeCursor();
+ /** makes an appropriate subclass depending on the index version */
+ static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+ static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+ static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+ static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+
+ virtual bool ok() { return !bucket.isNull(); }
+ virtual bool advance();
+ virtual void noteLocation(); // updates keyAtKeyOfs...
+ virtual void checkLocation() = 0;
+ virtual bool supportGetMore() { return true; }
+ virtual bool supportYields() { return true; }
+
+ /**
+         * Used during multikey index traversal to avoid returning duplicate
+         * documents; see Matcher::matches(). During such a traversal:
+         * if loc has already been sent, returns true;
+         * otherwise, marks loc as sent.
+         * @return false if the loc has not been seen before
+ */
+ virtual bool getsetdup(DiskLoc loc) {
+ if( _multikey ) {
+ pair<set<DiskLoc>::iterator, bool> p = _dups.insert(loc);
+ return !p.second;
+ }
+ return false;
+ }
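+
+        // Usage sketch (illustrative, not from the original source): a document
+        // { a : [ 1, 2 ] } indexed on { a : 1 } appears under two index keys, so
+        // a range scan can visit the same DiskLoc twice. Callers typically do
+        //     if ( c->getsetdup( c->currLoc() ) ) { c->advance(); continue; }
+        // so only the first visit is returned to the client.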
+
+ virtual bool modifiedKeys() const { return _multikey; }
+ virtual bool isMultiKey() const { return _multikey; }
+
+ /*const _KeyNode& _currKeyNode() const {
+ assert( !bucket.isNull() );
+ const _KeyNode& kn = keyNode(keyOfs);
+ assert( kn.isUsed() );
+ return kn;
+ }*/
+
+ /** returns BSONObj() if ofs is out of range */
+ virtual BSONObj keyAt(int ofs) const = 0;
+
+ virtual BSONObj currKey() const = 0;
+ virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); }
+
+ virtual void aboutToDeleteBucket(const DiskLoc& b) {
+ if ( bucket == b )
+ keyOfs = -1;
+ }
+
+ virtual DiskLoc currLoc() = 0; // { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); }
+ virtual DiskLoc refLoc() { return currLoc(); }
+ virtual Record* _current() { return currLoc().rec(); }
+ virtual BSONObj current() { return BSONObj(_current()); }
+ virtual string toString();
+
+ BSONObj prettyKey( const BSONObj &key ) const {
+ return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable();
+ }
+
+ virtual BSONObj prettyIndexBounds() const;
+
+ virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
+
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
+
+ virtual long long nscanned() { return _nscanned; }
+
+ /** for debugging only */
+ const DiskLoc getBucket() const { return bucket; }
+ int getKeyOfs() const { return keyOfs; }
+
+ // just for unit tests
+ virtual bool curKeyHasChild() = 0;
+
+ protected:
+ /**
+ * Our btrees may (rarely) have "unused" keys when items are deleted.
+ * Skip past them.
+ */
+ virtual bool skipUnusedKeys() = 0;
+
+ bool skipOutOfRangeKeysAndCheckEnd();
+ void skipAndCheck();
+ void checkEnd();
+
+ /** selective audits on construction */
+ void audit();
+
+ virtual void _audit() = 0;
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) = 0;
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) = 0;
+
+ /** set initial bucket */
+ void initWithoutIndependentFieldRanges();
+
+ /** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */
+ void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive );
+
+ set<DiskLoc> _dups;
+ NamespaceDetails * const d;
+ const int idxNo;
+ BSONObj startKey;
+ BSONObj endKey;
+ bool _endKeyInclusive;
+        bool _multikey; // must be re-checked every getMore batch in case the index has since become multikey
+ const IndexDetails& indexDetails;
+ const BSONObj _order;
+ const Ordering _ordering;
+ DiskLoc bucket;
+ int keyOfs;
+ const int _direction; // 1=fwd,-1=reverse
+ BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call
+ DiskLoc locAtKeyOfs;
+ const shared_ptr< FieldRangeVector > _bounds;
+ auto_ptr< FieldRangeVectorIterator > _boundsIterator;
+ shared_ptr< CoveredIndexMatcher > _matcher;
+ bool _independentFieldRanges;
+ long long _nscanned;
+ };
+
+ template< class V >
+ struct Continuation {
+ //Continuation(const typename V::Key & k);
+ Continuation(DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key,
+ Ordering _order, IndexDetails& _idx) :
+ bLoc(thisLoc), recordLoc(_recordLoc), key(_key), order(_order), idx(_idx) {
+ op = Nothing;
+ }
+
+ DiskLoc bLoc;
+ DiskLoc recordLoc;
+ typename V::KeyOwned key;
+ const Ordering order;
+ IndexDetails& idx;
+ enum Op { Nothing, SetUsed, InsertHere } op;
+
+ int pos;
+ const BtreeBucket<V> *b;
+
+ void stepTwo() {
+ if( op == Nothing )
+ return;
+ else if( op == SetUsed ) {
+ const typename V::_KeyNode& kn = b->k(pos);
+ kn.writing().setUsed();
+ }
+ else {
+ b->insertHere(bLoc, pos, recordLoc, key, order, DiskLoc(), DiskLoc(), idx);
+ }
+ }
+ };
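+
+    /*
+     * Rough sketch of the intended two-phase use (inferred from the members
+     * above, not verbatim from the source): insertStepOne() locates the
+     * position and records which Op to perform, and stepTwo() applies it:
+     *
+     *     Continuation<V1> c( head, recordLoc, key, ordering, idx );
+     *     head.btree<V1>()->insertStepOne( head, c, dupsAllowed ); // sets op/pos/b
+     *     c.stepTwo();                                             // performs the deferred write
+     */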
+
+ /** Renames the index namespace for this btree's index. */
+ void renameIndexNamespace(const char *oldNs, const char *newNs);
+
+ /**
+ * give us a writable version of the btree bucket (declares write intent).
+ * note it is likely more efficient to declare write intent on something smaller when you can.
+ */
+ template< class V >
+ BtreeBucket<V> * DiskLoc::btreemod() const {
+ assert( _a != -1 );
+ BtreeBucket<V> *b = const_cast< BtreeBucket<V> * >( btree<V>() );
+ return static_cast< BtreeBucket<V>* >( getDur().writingPtr( b, V::BucketSize ) );
+ }
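+
+    // Hedged usage note: btreemod() casts away const and declares write
+    // intent for the entire bucket through the durability layer, e.g.
+    //     BtreeBucket<V1> *b = loc.btreemod<V1>(); // whole-bucket write intent
+    // after which b may be mutated freely and the changes are journaled.
+    // As the comment above notes, declaring intent on just the bytes you
+    // change is usually cheaper when that is possible.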
+
+ template< class V >
+ BucketBasics<V>::KeyNode::KeyNode(const BucketBasics<V>& bb, const _KeyNode &k) :
+ prevChildBucket(k.prevChildBucket),
+ recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
+ { }
+
+} // namespace mongo
diff --git a/src/mongo/db/btreebuilder.cpp b/src/mongo/db/btreebuilder.cpp
new file mode 100644
index 00000000000..0ec587a1958
--- /dev/null
+++ b/src/mongo/db/btreebuilder.cpp
@@ -0,0 +1,184 @@
+// btreebuilder.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "json.h"
+#include "clientcursor.h"
+#include "client.h"
+#include "dbhelpers.h"
+#include "curop-inl.h"
+#include "stats/counters.h"
+#include "dur_commitjob.h"
+#include "btreebuilder.h"
+
+namespace mongo {
+
+ /* --- BtreeBuilder --- */
+
+ template<class V>
+ BtreeBuilder<V>::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) :
+ dupsAllowed(_dupsAllowed),
+ idx(_idx),
+ n(0),
+ order( idx.keyPattern() ),
+ ordering( Ordering::make(idx.keyPattern()) ) {
+ first = cur = BtreeBucket<V>::addBucket(idx);
+ b = cur.btreemod<V>();
+ committed = false;
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::newBucket() {
+ DiskLoc L = BtreeBucket<V>::addBucket(idx);
+ b->setTempNext(L);
+ cur = L;
+ b = cur.btreemod<V>();
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::mayCommitProgressDurably() {
+ if ( getDur().commitIfNeeded() ) {
+ b = cur.btreemod<V>();
+ }
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::addKey(BSONObj& _key, DiskLoc loc) {
+
+ auto_ptr< KeyOwned > key( new KeyOwned(_key) );
+ if ( key->dataSize() > BtreeBucket<V>::KeyMax ) {
+ problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace()
+ << ' ' << key->dataSize() << ' ' << key->toString() << endl;
+ return;
+ }
+
+ if( !dupsAllowed ) {
+ if( n > 0 ) {
+ int cmp = keyLast->woCompare(*key, ordering);
+ massert( 10288 , "bad key order in BtreeBuilder - server internal error", cmp <= 0 );
+ if( cmp == 0 ) {
+ //if( !dupsAllowed )
+ uasserted( ASSERT_ID_DUPKEY , BtreeBucket<V>::dupKeyError( idx , *keyLast ) );
+ }
+ }
+ }
+
+ if ( ! b->_pushBack(loc, *key, ordering, DiskLoc()) ) {
+ // bucket was full
+ newBucket();
+ b->pushBack(loc, *key, ordering, DiskLoc());
+ }
+ keyLast = key;
+ n++;
+ mayCommitProgressDurably();
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::buildNextLevel(DiskLoc loc) {
+ int levels = 1;
+ while( 1 ) {
+ if( loc.btree<V>()->tempNext().isNull() ) {
+ // only 1 bucket at this level. we are done.
+ getDur().writingDiskLoc(idx.head) = loc;
+ break;
+ }
+ levels++;
+
+ DiskLoc upLoc = BtreeBucket<V>::addBucket(idx);
+ DiskLoc upStart = upLoc;
+ BtreeBucket<V> *up = upLoc.btreemod<V>();
+
+ DiskLoc xloc = loc;
+ while( !xloc.isNull() ) {
+ if ( getDur().commitIfNeeded() ) {
+ b = cur.btreemod<V>();
+ up = upLoc.btreemod<V>();
+ }
+
+ BtreeBucket<V> *x = xloc.btreemod<V>();
+ Key k;
+ DiskLoc r;
+ x->popBack(r,k);
+ bool keepX = ( x->n != 0 );
+ DiskLoc keepLoc = keepX ? xloc : x->nextChild;
+
+ if ( ! up->_pushBack(r, k, ordering, keepLoc) ) {
+ // current bucket full
+ DiskLoc n = BtreeBucket<V>::addBucket(idx);
+ up->setTempNext(n);
+ upLoc = n;
+ up = upLoc.btreemod<V>();
+ up->pushBack(r, k, ordering, keepLoc);
+ }
+
+ DiskLoc nextLoc = x->tempNext(); // get next in chain at current level
+ if ( keepX ) {
+ x->parent = upLoc;
+ }
+ else {
+ if ( !x->nextChild.isNull() ) {
+ DiskLoc ll = x->nextChild;
+ ll.btreemod<V>()->parent = upLoc;
+ //(x->nextChild.btreemod<V>())->parent = upLoc;
+ }
+ x->deallocBucket( xloc, idx );
+ }
+ xloc = nextLoc;
+ }
+
+ loc = upStart;
+ mayCommitProgressDurably();
+ }
+
+ if( levels > 1 )
+ log(2) << "btree levels: " << levels << endl;
+ }
+
+ /** when all addKeys are done, we then build the higher levels of the tree */
+ template<class V>
+ void BtreeBuilder<V>::commit() {
+ buildNextLevel(first);
+ committed = true;
+ }
+
+ template<class V>
+ BtreeBuilder<V>::~BtreeBuilder() {
+ DESTRUCTOR_GUARD(
+ if( !committed ) {
+ log(2) << "Rolling back partially built index space" << endl;
+ DiskLoc x = first;
+ while( !x.isNull() ) {
+ DiskLoc next = x.btree<V>()->tempNext();
+ string ns = idx.indexNamespace();
+ theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x);
+ x = next;
+ getDur().commitIfNeeded();
+ }
+ assert( idx.head.isNull() );
+ log(2) << "done rollback" << endl;
+ }
+ )
+ }
+
+ template class BtreeBuilder<V0>;
+ template class BtreeBuilder<V1>;
+
+}
diff --git a/src/mongo/db/btreebuilder.h b/src/mongo/db/btreebuilder.h
new file mode 100644
index 00000000000..6de55d89299
--- /dev/null
+++ b/src/mongo/db/btreebuilder.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "btree.h"
+
+namespace mongo {
+
+ /**
+ * build btree from the bottom up
+ */
+ template< class V >
+ class BtreeBuilder {
+ typedef typename V::KeyOwned KeyOwned;
+ typedef typename V::Key Key;
+
+ bool dupsAllowed;
+ IndexDetails& idx;
+ /** Number of keys added to btree. */
+ unsigned long long n;
+ /** Last key passed to addKey(). */
+ auto_ptr< typename V::KeyOwned > keyLast;
+ BSONObj order;
+ Ordering ordering;
+ /** true iff commit() completed successfully. */
+ bool committed;
+
+ DiskLoc cur, first;
+ BtreeBucket<V> *b;
+
+ void newBucket();
+ void buildNextLevel(DiskLoc);
+ void mayCommitProgressDurably();
+
+ public:
+ ~BtreeBuilder();
+
+ BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx);
+
+ /**
+ * Preconditions: 'key' is > or >= last key passed to this function (depends on _dupsAllowed)
+ * Postconditions: 'key' is added to intermediate storage.
+ */
+ void addKey(BSONObj& key, DiskLoc loc);
+
+ /**
+ * commit work. if not called, destructor will clean up partially completed work
+ * (in case exception has happened).
+ */
+ void commit();
+
+ unsigned long long getn() { return n; }
+ };
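+
+    /*
+     * Usage sketch (illustrative, assuming an IndexDetails 'idx' and keys
+     * arriving already sorted in index order, as addKey() requires):
+     *
+     *     BtreeBuilder<V1> builder( dupsAllowed, idx );
+     *     for each (key, loc) in sorted order:
+     *         builder.addKey( key, loc );
+     *     builder.commit();   // builds the upper levels; omitting it rolls back
+     */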
+
+}
diff --git a/src/mongo/db/btreecursor.cpp b/src/mongo/db/btreecursor.cpp
new file mode 100644
index 00000000000..7ddd4874ef6
--- /dev/null
+++ b/src/mongo/db/btreecursor.cpp
@@ -0,0 +1,457 @@
+// btreecursor.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "curop-inl.h"
+#include "queryutil.h"
+
+namespace mongo {
+
+ template< class V >
+ class BtreeCursorImpl : public BtreeCursor {
+ public:
+ typedef typename BucketBasics<V>::KeyNode KeyNode;
+ typedef typename V::Key Key;
+ typedef typename V::_KeyNode _KeyNode;
+
+ BtreeCursorImpl(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) :
+ BtreeCursor(a,b,c,d,e,f,g) { }
+ BtreeCursorImpl(NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ) :
+ BtreeCursor(_d,_idxNo,_id,_bounds,_direction )
+ {
+ pair< DiskLoc, int > noBestParent;
+ indexDetails.head.btree<V>()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent );
+ skipAndCheck();
+ dassert( _dups.size() == 0 );
+ }
+
+ virtual DiskLoc currLoc() {
+ if( bucket.isNull() ) return DiskLoc();
+ return currKeyNode().recordLoc;
+ }
+
+ virtual BSONObj keyAt(int ofs) const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ int n = b->getN();
+ if( n == 0xffff ) {
+ throw UserException(15850, "keyAt bucket deleted");
+ }
+ dassert( n >= 0 && n < 10000 );
+ return ofs >= n ? BSONObj() : b->keyNode(ofs).key.toBson();
+ }
+
+ virtual BSONObj currKey() const {
+ assert( !bucket.isNull() );
+ return bucket.btree<V>()->keyNode(keyOfs).key.toBson();
+ }
+
+ virtual bool curKeyHasChild() {
+ return !currKeyNode().prevChildBucket.isNull();
+ }
+
+ bool skipUnusedKeys() {
+ int u = 0;
+ while ( 1 ) {
+ if ( !ok() )
+ break;
+ const _KeyNode& kn = keyNode(keyOfs);
+ if ( kn.isUsed() )
+ break;
+ bucket = _advance(bucket, keyOfs, _direction, "skipUnusedKeys");
+ u++;
+ //don't include unused keys in nscanned
+ //++_nscanned;
+ }
+ if ( u > 10 )
+ OCCASIONALLY log() << "btree unused skipped:" << u << '\n';
+ return u;
+ }
+
+ /* Since the last noteLocation(), our key may have moved around, and that old cached
+ information may thus be stale and wrong (although often it is right). We check
+           that here; if we have moved, we have to search back to find where we were.
+
+ i.e., after operations on the index, the BtreeCursor's cached location info may
+ be invalid. This function ensures validity, so you should call it before using
+ the cursor if other writers have used the database since the last noteLocation
+ call.
+ */
+ void checkLocation() {
+ if ( eof() )
+ return;
+
+ _multikey = d->isMultikey(idxNo);
+
+ if ( keyOfs >= 0 ) {
+ assert( !keyAtKeyOfs.isEmpty() );
+
+ try {
+ // Note keyAt() returns an empty BSONObj if keyOfs is now out of range,
+ // which is possible as keys may have been deleted.
+ int x = 0;
+ while( 1 ) {
+ // if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) &&
+ // b->k(keyOfs).recordLoc == locAtKeyOfs ) {
+ if ( keyAt(keyOfs).binaryEqual(keyAtKeyOfs) ) {
+ const _KeyNode& kn = keyNode(keyOfs);
+ if( kn.recordLoc == locAtKeyOfs ) {
+ if ( !kn.isUsed() ) {
+ // we were deleted but still exist as an unused
+ // marker key. advance.
+ skipUnusedKeys();
+ }
+ return;
+ }
+ }
+
+ // we check one key earlier too, in case a key was just deleted. this is
+ // important so that multi updates are reasonably fast.
+ if( keyOfs == 0 || x++ )
+ break;
+ keyOfs--;
+ }
+ }
+ catch(UserException& e) {
+ if( e.getCode() != 15850 )
+ throw;
+ // hack: fall through if bucket was just deleted. should only happen under deleteObjects()
+ DEV log() << "debug info: bucket was deleted" << endl;
+ }
+ }
+
+ /* normally we don't get to here. when we do, old position is no longer
+ valid and we must refind where we left off (which is expensive)
+ */
+
+ /* TODO: Switch to keep indexdetails and do idx.head! */
+ bucket = _locate(keyAtKeyOfs, locAtKeyOfs);
+ RARELY log() << "key seems to have moved in the index, refinding. " << bucket.toString() << endl;
+ if ( ! bucket.isNull() )
+ skipUnusedKeys();
+
+ }
+
+ protected:
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) {
+ thisLoc.btree<V>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction);
+ }
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V>()->advance(thisLoc, keyOfs, direction, caller);
+ }
+ virtual void _audit() {
+ out() << "BtreeCursor(). dumping head bucket" << endl;
+ indexDetails.head.btree<V>()->dump();
+ }
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) {
+ bool found;
+ return indexDetails.head.btree<V>()->
+ locate(indexDetails, indexDetails.head, key, _ordering, keyOfs, found, loc, _direction);
+ }
+
+ const _KeyNode& keyNode(int keyOfs) const {
+ return bucket.btree<V>()->k(keyOfs);
+ }
+
+ private:
+ const KeyNode currKeyNode() const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ return b->keyNode(keyOfs);
+ }
+ };
+
+ template class BtreeCursorImpl<V0>;
+ template class BtreeCursorImpl<V1>;
+
+ /*
+ class BtreeCursorV1 : public BtreeCursor {
+ public:
+ typedef BucketBasics<V1>::KeyNode KeyNode;
+ typedef V1::Key Key;
+
+ BtreeCursorV1(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) :
+ BtreeCursor(a,b,c,d,e,f,g) { }
+ BtreeCursorV1(NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction) :
+ BtreeCursor(_d,_idxNo,_id,_bounds,_direction)
+ {
+ pair< DiskLoc, int > noBestParent;
+ indexDetails.head.btree<V1>()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent );
+ skipAndCheck();
+ dassert( _dups.size() == 0 );
+ }
+
+ virtual DiskLoc currLoc() {
+ if( bucket.isNull() ) return DiskLoc();
+ return currKeyNode().recordLoc;
+ }
+
+ virtual BSONObj currKey() const {
+ assert( !bucket.isNull() );
+ return bucket.btree<V1>()->keyNode(keyOfs).key.toBson();
+ }
+
+ protected:
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) {
+ thisLoc.btree<V1>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction);
+ }
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V1>()->advance(thisLoc, keyOfs, direction, caller);
+ }
+ virtual void _audit() {
+ out() << "BtreeCursor(). dumping head bucket" << endl;
+ indexDetails.head.btree<V1>()->dump();
+ }
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc);
+ virtual const _KeyNode& keyNode(int keyOfs) {
+ return bucket.btree<V1>()->k(keyOfs);
+ }
+
+ private:
+ const KeyNode currKeyNode() const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V1> *b = bucket.btree<V1>();
+ return b->keyNode(keyOfs);
+ }
+ };*/
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, const IndexDetails& _id,
+ const shared_ptr< FieldRangeVector > &_bounds, int _direction )
+ {
+ return make( _d, _d->idxNo( (IndexDetails&) _id), _id, _bounds, _direction );
+ }
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, const IndexDetails& _id,
+ const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction)
+ {
+ return make( _d, _d->idxNo( (IndexDetails&) _id), _id, startKey, endKey, endKeyInclusive, direction );
+ }
+
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, int _idxNo, const IndexDetails& _id,
+ const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction)
+ {
+ int v = _id.version();
+ BtreeCursor *c = 0;
+ if( v == 1 ) {
+ c = new BtreeCursorImpl<V1>(_d,_idxNo,_id,startKey,endKey,endKeyInclusive,direction);
+ }
+ else if( v == 0 ) {
+ c = new BtreeCursorImpl<V0>(_d,_idxNo,_id,startKey,endKey,endKeyInclusive,direction);
+ }
+ else {
+ uasserted(14800, str::stream() << "unsupported index version " << v);
+ }
+ c->initWithoutIndependentFieldRanges();
+ dassert( c->_dups.size() == 0 );
+ return c;
+ }
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, int _idxNo, const IndexDetails& _id,
+ const shared_ptr< FieldRangeVector > &_bounds, int _direction )
+ {
+ int v = _id.version();
+ if( v == 1 )
+ return new BtreeCursorImpl<V1>(_d,_idxNo,_id,_bounds,_direction);
+ if( v == 0 )
+ return new BtreeCursorImpl<V0>(_d,_idxNo,_id,_bounds,_direction);
+ uasserted(14801, str::stream() << "unsupported index version " << v);
+
+ // just check we are in sync with this method
+ dassert( IndexDetails::isASupportedIndexVersionNumber(v) );
+
+ return 0;
+ }
+
+ BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails &_id,
+ const BSONObj &_startKey, const BSONObj &_endKey, bool endKeyInclusive, int _direction ) :
+ d(_d), idxNo(_idxNo),
+ startKey( _startKey ),
+ endKey( _endKey ),
+ _endKeyInclusive( endKeyInclusive ),
+ _multikey( d->isMultikey( idxNo ) ),
+ indexDetails( _id ),
+ _order( _id.keyPattern() ),
+ _ordering( Ordering::make( _order ) ),
+ _direction( _direction ),
+ _independentFieldRanges( false ),
+ _nscanned( 0 ) {
+ audit();
+ }
+
+ BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction )
+ :
+ d(_d), idxNo(_idxNo),
+ _endKeyInclusive( true ),
+ _multikey( d->isMultikey( idxNo ) ),
+ indexDetails( _id ),
+ _order( _id.keyPattern() ),
+ _ordering( Ordering::make( _order ) ),
+ _direction( _direction ),
+ _bounds( ( assert( _bounds.get() ), _bounds ) ),
+ _boundsIterator( new FieldRangeVectorIterator( *_bounds ) ),
+ _independentFieldRanges( true ),
+ _nscanned( 0 ) {
+ audit();
+ startKey = _bounds->startKey();
+ _boundsIterator->advance( startKey ); // handles initialization
+ _boundsIterator->prepDive();
+ bucket = indexDetails.head;
+ keyOfs = 0;
+ }
+
+ /** Properly destroy forward declared class members. */
+ BtreeCursor::~BtreeCursor() {}
+
+ void BtreeCursor::audit() {
+ dassert( d->idxNo((IndexDetails&) indexDetails) == idxNo );
+ }
+
+ void BtreeCursor::initWithoutIndependentFieldRanges() {
+ if ( indexDetails.getSpec().getType() ) {
+ startKey = indexDetails.getSpec().getType()->fixKey( startKey );
+ endKey = indexDetails.getSpec().getType()->fixKey( endKey );
+ }
+ bucket = _locate(startKey, _direction > 0 ? minDiskLoc : maxDiskLoc);
+ if ( ok() ) {
+ _nscanned = 1;
+ }
+ skipUnusedKeys();
+ checkEnd();
+ }
+
+ void BtreeCursor::skipAndCheck() {
+ long long startNscanned = _nscanned;
+ skipUnusedKeys();
+ while( 1 ) {
+ if ( !skipOutOfRangeKeysAndCheckEnd() ) {
+ break;
+ }
+ do {
+ if ( _nscanned > startNscanned + 20 ) {
+ skipUnusedKeys();
+ return;
+ }
+ } while( skipOutOfRangeKeysAndCheckEnd() );
+ if ( !skipUnusedKeys() ) {
+ break;
+ }
+ }
+ }
+
+ bool BtreeCursor::skipOutOfRangeKeysAndCheckEnd() {
+ if ( !ok() ) {
+ return false;
+ }
+ int ret = _boundsIterator->advance( currKey() );
+ if ( ret == -2 ) {
+ bucket = DiskLoc();
+ return false;
+ }
+ else if ( ret == -1 ) {
+ ++_nscanned;
+ return false;
+ }
+ ++_nscanned;
+ advanceTo( currKey(), ret, _boundsIterator->after(), _boundsIterator->cmp(), _boundsIterator->inc() );
+ return true;
+ }
+
+ // Return a value in the set {-1, 0, 1} to represent the sign of parameter i.
+ int sgn( int i ) {
+ if ( i == 0 )
+ return 0;
+ return i > 0 ? 1 : -1;
+ }
+
+ // Check if the current key is beyond endKey.
+ void BtreeCursor::checkEnd() {
+ if ( bucket.isNull() )
+ return;
+ if ( !endKey.isEmpty() ) {
+ int cmp = sgn( endKey.woCompare( currKey(), _order ) );
+ if ( ( cmp != 0 && cmp != _direction ) ||
+ ( cmp == 0 && !_endKeyInclusive ) )
+ bucket = DiskLoc();
+ }
+ }
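+
+    // Worked example (explanatory, not in the original): scanning forward
+    // (_direction == 1) with an exclusive endKey, currKey() == endKey gives
+    // cmp == 0 and !_endKeyInclusive, so bucket is nulled and the cursor is
+    // eof. If currKey() were past endKey, cmp would be -1 != _direction,
+    // which stops the scan as well.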
+
+ void BtreeCursor::advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive) {
+ _advanceTo( bucket, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, _ordering, _direction );
+ }
+
+ bool BtreeCursor::advance() {
+ killCurrentOp.checkForInterrupt();
+ if ( bucket.isNull() )
+ return false;
+
+ bucket = _advance(bucket, keyOfs, _direction, "BtreeCursor::advance");
+
+ if ( !_independentFieldRanges ) {
+ skipUnusedKeys();
+ checkEnd();
+ if ( ok() ) {
+ ++_nscanned;
+ }
+ }
+ else {
+ skipAndCheck();
+ }
+ return ok();
+ }
+
+ void BtreeCursor::noteLocation() {
+ if ( !eof() ) {
+ BSONObj o = currKey().getOwned();
+ keyAtKeyOfs = o;
+ locAtKeyOfs = currLoc();
+ }
+ }
+
+ string BtreeCursor::toString() {
+ string s = string("BtreeCursor ") + indexDetails.indexName();
+ if ( _direction < 0 ) s += " reverse";
+ if ( _bounds.get() && _bounds->size() > 1 ) s += " multi";
+ return s;
+ }
+
+ BSONObj BtreeCursor::prettyIndexBounds() const {
+ if ( !_independentFieldRanges ) {
+ return BSON( "start" << prettyKey( startKey ) << "end" << prettyKey( endKey ) );
+ }
+ else {
+ return _bounds->obj();
+ }
+ }
+
+ /* ----------------------------------------------------------------------------- */
+
+ struct BtreeCursorUnitTest {
+ BtreeCursorUnitTest() {
+ assert( minDiskLoc.compare(maxDiskLoc) < 0 );
+ }
+ } btut;
+
+} // namespace mongo
diff --git a/src/mongo/db/cap.cpp b/src/mongo/db/cap.cpp
new file mode 100644
index 00000000000..a8be2383115
--- /dev/null
+++ b/src/mongo/db/cap.cpp
@@ -0,0 +1,457 @@
+// @file cap.cpp capped collection related
+// the "old" version (<= v1.6)
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "db.h"
+#include "../util/mmap.h"
+#include "../util/hashtab.h"
+#include "../scripting/engine.h"
+#include "btree.h"
+#include <algorithm>
+#include <list>
+#include "json.h"
+#include "clientcursor.h"
+
+/*
+ capped collection layout
+
+ d's below won't exist if things align perfectly:
+
+ extent1 -> extent2 -> extent3
+ ------------------- ----------------------- ---------------------
+ d r r r r r r r r d d r r r r d r r r r r d d r r r r r r r r r d
+ ^ ^
+ oldest newest
+
+ ^cappedFirstDeletedInCurExtent()
+ ^cappedLastDelRecLastExtent()
+ ^cappedListOfAllDeletedRecords()
+*/
+
+
+namespace mongo {
+
+ /* combine adjacent deleted records *for the current extent* of the capped collection
+
+ this is O(n^2) but we call it for capped tables where typically n==1 or 2!
+ (or 3...there will be a little unused sliver at the end of the extent.)
+ */
+ void NamespaceDetails::compact() {
+ assert(capped);
+
+ list<DiskLoc> drecs;
+
+ // Pull out capExtent's DRs from deletedList
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted )
+ drecs.push_back( i );
+
+ getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i;
+
+ // This is the O(n^2) part.
+ drecs.sort();
+
+ list<DiskLoc>::iterator j = drecs.begin();
+ assert( j != drecs.end() );
+ DiskLoc a = *j;
+ while ( 1 ) {
+ j++;
+ if ( j == drecs.end() ) {
+ DEBUGGING out() << "TEMP: compact adddelrec\n";
+ addDeletedRec(a.drec(), a);
+ break;
+ }
+ DiskLoc b = *j;
+ while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) {
+ // a & b are adjacent. merge.
+ getDur().writingInt( a.drec()->lengthWithHeaders ) += b.drec()->lengthWithHeaders;
+ j++;
+ if ( j == drecs.end() ) {
+ DEBUGGING out() << "temp: compact adddelrec2\n";
+ addDeletedRec(a.drec(), a);
+ return;
+ }
+ b = *j;
+ }
+ DEBUGGING out() << "temp: compact adddelrec3\n";
+ addDeletedRec(a.drec(), a);
+ a = b;
+ }
+ }
+
+ DiskLoc &NamespaceDetails::cappedFirstDeletedInCurExtent() {
+ if ( cappedLastDelRecLastExtent().isNull() )
+ return cappedListOfAllDeletedRecords();
+ else
+ return cappedLastDelRecLastExtent().drec()->nextDeleted;
+ }
+
+ void NamespaceDetails::cappedCheckMigrate() {
+ // migrate old NamespaceDetails format
+ assert( capped );
+ if ( capExtent.a() == 0 && capExtent.getOfs() == 0 ) {
+ //capFirstNewRecord = DiskLoc();
+ capFirstNewRecord.writing().setInvalid();
+ // put all the DeletedRecords in cappedListOfAllDeletedRecords()
+ for ( int i = 1; i < Buckets; ++i ) {
+ DiskLoc first = deletedList[ i ];
+ if ( first.isNull() )
+ continue;
+ DiskLoc last = first;
+ for (; !last.drec()->nextDeleted.isNull(); last = last.drec()->nextDeleted );
+ last.drec()->nextDeleted.writing() = cappedListOfAllDeletedRecords();
+ cappedListOfAllDeletedRecords().writing() = first;
+ deletedList[i].writing() = DiskLoc();
+ }
+ // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above
+
+ // Last, in case we're killed before getting here
+ capExtent.writing() = firstExtent;
+ }
+ }
+
+ bool NamespaceDetails::inCapExtent( const DiskLoc &dl ) const {
+ assert( !dl.isNull() );
+ // We could have a rec or drec, doesn't matter.
+ bool res = dl.drec()->myExtentLoc(dl) == capExtent;
+ DEV {
+ // old implementation. this check is temp to test works the same. new impl should be a little faster.
+ assert( res == (dl.drec()->myExtent( dl ) == capExtent.ext()) );
+ }
+ return res;
+ }
+
+ bool NamespaceDetails::nextIsInCapExtent( const DiskLoc &dl ) const {
+ assert( !dl.isNull() );
+ DiskLoc next = dl.drec()->nextDeleted;
+ if ( next.isNull() )
+ return false;
+ return inCapExtent( next );
+ }
+
+ void NamespaceDetails::advanceCapExtent( const char *ns ) {
+ // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent
+ // (or DiskLoc() if new capExtent == firstExtent)
+ if ( capExtent == lastExtent )
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
+ else {
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ for (; !i.isNull() && nextIsInCapExtent( i ); i = i.drec()->nextDeleted );
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = i;
+ }
+
+ getDur().writingDiskLoc( capExtent ) = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext;
+
+ /* this isn't true if a collection has been renamed...that is ok just used for diagnostics */
+ //dassert( theCapExtent()->ns == ns );
+
+ theCapExtent()->assertOk();
+ getDur().writingDiskLoc( capFirstNewRecord ) = DiskLoc();
+ }
+
+ DiskLoc NamespaceDetails::__capAlloc( int len ) {
+ DiskLoc prev = cappedLastDelRecLastExtent();
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ DiskLoc ret;
+ for (; !i.isNull() && inCapExtent( i ); prev = i, i = i.drec()->nextDeleted ) {
+ // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(),
+ // so make sure there's space to create a DR at the end.
+ if ( i.drec()->lengthWithHeaders >= len + 24 ) {
+ ret = i;
+ break;
+ }
+ }
+
+ /* unlink ourself from the deleted list */
+ if ( !ret.isNull() ) {
+ if ( prev.isNull() )
+ cappedListOfAllDeletedRecords().writing() = ret.drec()->nextDeleted;
+ else
+ prev.drec()->nextDeleted.writing() = ret.drec()->nextDeleted;
+ ret.drec()->nextDeleted.writing().setInvalid(); // defensive.
+ assert( ret.drec()->extentOfs < ret.getOfs() );
+ }
+
+ return ret;
+ }
+
+ DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) {
+ // signal done allocating new extents.
+ if ( !cappedLastDelRecLastExtent().isValid() )
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
+
+ assert( len < 400000000 );
+ int passes = 0;
+ int maxPasses = ( len / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog
+ if ( maxPasses < 5000 ) {
+            // this is for backwards safety since 5000 was the old value
+ maxPasses = 5000;
+ }
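+
+        // Worked example: len == 3000 gives maxPasses = 3000/30 + 2 == 102,
+        // which is then raised to the 5000 floor; only allocations larger than
+        // roughly 150KB yield a computed maxPasses above the old 5000 default.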
+ DiskLoc loc;
+
+        // delete records until we have room and the max # objects limit is satisfied.
+
+ /* this fails on a rename -- that is ok but must keep commented out */
+ //assert( theCapExtent()->ns == ns );
+
+ theCapExtent()->assertOk();
+ DiskLoc firstEmptyExtent;
+ while ( 1 ) {
+ if ( stats.nrecords < max ) {
+ loc = __capAlloc( len );
+ if ( !loc.isNull() )
+ break;
+ }
+
+ // If on first iteration through extents, don't delete anything.
+ if ( !capFirstNewRecord.isValid() ) {
+ advanceCapExtent( ns );
+
+ if ( capExtent != firstExtent )
+ capFirstNewRecord.writing().setInvalid();
+ // else signal done with first iteration through extents.
+ continue;
+ }
+
+ if ( !capFirstNewRecord.isNull() &&
+ theCapExtent()->firstRecord == capFirstNewRecord ) {
+ // We've deleted all records that were allocated on the previous
+ // iteration through this extent.
+ advanceCapExtent( ns );
+ continue;
+ }
+
+ if ( theCapExtent()->firstRecord.isNull() ) {
+ if ( firstEmptyExtent.isNull() )
+ firstEmptyExtent = capExtent;
+ advanceCapExtent( ns );
+ if ( firstEmptyExtent == capExtent ) {
+ maybeComplain( ns, len );
+ return DiskLoc();
+ }
+ continue;
+ }
+
+ DiskLoc fr = theCapExtent()->firstRecord;
+ theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true); // ZZZZZZZZZZZZ
+ compact();
+ if( ++passes > maxPasses ) {
+ log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n';
+ log() << "passes max:" << max << " nrecords:" << stats.nrecords << " datasize: " << stats.datasize << endl;
+ massert( 10345 , "passes >= maxPasses in capped collection alloc", false );
+ }
+ }
+
+ // Remember first record allocated on this iteration through capExtent.
+ if ( capFirstNewRecord.isValid() && capFirstNewRecord.isNull() )
+ getDur().writingDiskLoc(capFirstNewRecord) = loc;
+
+ return loc;
+ }
+
+ void NamespaceDetails::dumpExtents() {
+ cout << "dumpExtents:" << endl;
+ for ( DiskLoc i = firstExtent; !i.isNull(); i = i.ext()->xnext ) {
+ Extent *e = i.ext();
+ stringstream ss;
+ e->dump(ss);
+ cout << ss.str() << endl;
+ }
+ }
+
+ void NamespaceDetails::cappedDumpDelInfo() {
+ cout << "dl[0]: " << deletedList[0].toString() << endl;
+ for( DiskLoc z = deletedList[0]; !z.isNull(); z = z.drec()->nextDeleted ) {
+ cout << " drec:" << z.toString() << " dreclen:" << hex << z.drec()->lengthWithHeaders <<
+ " ext:" << z.drec()->myExtent(z)->myLoc.toString() << endl;
+ }
+ cout << "dl[1]: " << deletedList[1].toString() << endl;
+ }
+
+ void NamespaceDetails::cappedTruncateLastDelUpdate() {
+ if ( capExtent == firstExtent ) {
+ // Only one extent of the collection is in use, so there
+ // is no deleted record in a previous extent, so nullify
+ // cappedLastDelRecLastExtent().
+ cappedLastDelRecLastExtent().writing() = DiskLoc();
+ }
+ else {
+ // Scan through all deleted records in the collection
+ // until the last deleted record for the extent prior
+ // to the new capExtent is found. Then set
+ // cappedLastDelRecLastExtent() to that deleted record.
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for( ;
+ !i.drec()->nextDeleted.isNull() &&
+ !inCapExtent( i.drec()->nextDeleted );
+ i = i.drec()->nextDeleted );
+ // In our capped storage model, every extent must have at least one
+ // deleted record. Here we check that 'i' is not the last deleted
+ // record. (We expect that there will be deleted records in the new
+ // capExtent as well.)
+ assert( !i.drec()->nextDeleted.isNull() );
+ cappedLastDelRecLastExtent().writing() = i;
+ }
+ }
+
+ void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) {
+ DEV assert( this == nsdetails(ns) );
+ assert( cappedLastDelRecLastExtent().isValid() );
+
+ // We iteratively remove the newest document until the newest document
+ // is 'end', then we remove 'end' if requested.
+ bool foundLast = false;
+ while( 1 ) {
+ if ( foundLast ) {
+ // 'end' has been found and removed, so break.
+ break;
+ }
+ getDur().commitIfNeeded();
+ // 'curr' will point to the newest document in the collection.
+ DiskLoc curr = theCapExtent()->lastRecord;
+ assert( !curr.isNull() );
+ if ( curr == end ) {
+ if ( inclusive ) {
+ // 'end' has been found, so break next iteration.
+ foundLast = true;
+ }
+ else {
+ // 'end' has been found, so break.
+ break;
+ }
+ }
+
+ // TODO The algorithm used in this function cannot generate an
+ // empty collection, but we could call emptyCappedCollection() in
+ // this case instead of asserting.
+ uassert( 13415, "emptying the collection is not allowed", stats.nrecords > 1 );
+
+ // Delete the newest record, and coalesce the new deleted
+ // record with existing deleted records.
+ theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true);
+ compact();
+
+ // This is the case where we have not yet had to remove any
+ // documents to make room for other documents, and we are allocating
+ // documents from free space in fresh extents instead of reusing
+ // space from familiar extents.
+ if ( !capLooped() ) {
+
+ // We just removed the last record from the 'capExtent', and
+ // the 'capExtent' can't be empty, so we set 'capExtent' to
+ // capExtent's prev extent.
+ if ( theCapExtent()->lastRecord.isNull() ) {
+ assert( !theCapExtent()->xprev.isNull() );
+ // NOTE Because we didn't delete the last document, and
+ // capLooped() is false, capExtent is not the first extent
+ // so xprev will be nonnull.
+ capExtent.writing() = theCapExtent()->xprev;
+ theCapExtent()->assertOk();
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate();
+ }
+ continue;
+ }
+
+ // This is the case where capLooped() is true, and we just deleted
+ // from capExtent, and we just deleted capFirstNewRecord, which was
+ // the last record on the fresh side of capExtent.
+ // NOTE In this comparison, curr and potentially capFirstNewRecord
+ // may point to invalid data, but we can still compare the
+ // references themselves.
+ if ( curr == capFirstNewRecord ) {
+
+ // Set 'capExtent' to the first nonempty extent prior to the
+ // initial capExtent. There must be such an extent because we
+ // have not deleted the last document in the collection. It is
+ // possible that all extents other than the capExtent are empty.
+ // In this case we will keep the initial capExtent and specify
+ // that all records contained within are on the fresh rather than
+ // stale side of the extent.
+ DiskLoc newCapExtent = capExtent;
+ do {
+ // Find the previous extent, looping if necessary.
+ newCapExtent = ( newCapExtent == firstExtent ) ? lastExtent : newCapExtent.ext()->xprev;
+ newCapExtent.ext()->assertOk();
+ }
+ while ( newCapExtent.ext()->firstRecord.isNull() );
+ capExtent.writing() = newCapExtent;
+
+ // Place all documents in the new capExtent on the fresh side
+ // of the capExtent by setting capFirstNewRecord to the first
+ // document in the new capExtent.
+ capFirstNewRecord.writing() = theCapExtent()->firstRecord;
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate();
+ }
+ }
+ }
+
+ void NamespaceDetails::emptyCappedCollection( const char *ns ) {
+ DEV assert( this == nsdetails(ns) );
+ massert( 13424, "collection must be capped", capped );
+ massert( 13425, "background index build in progress", !indexBuildInProgress );
+ massert( 13426, "indexes present", nIndexes == 0 );
+
+ // Clear all references to this namespace.
+ ClientCursor::invalidate( ns );
+ NamespaceDetailsTransient::clearForPrefix( ns );
+
+ // Get a writeable reference to 'this' and reset all pertinent
+ // attributes.
+ NamespaceDetails *t = writingWithoutExtra();
+
+ t->cappedLastDelRecLastExtent() = DiskLoc();
+ t->cappedListOfAllDeletedRecords() = DiskLoc();
+
+ // preserve firstExtent/lastExtent
+ t->capExtent = firstExtent;
+ t->stats.datasize = stats.nrecords = 0;
+ // lastExtentSize preserve
+ // nIndexes preserve 0
+ // capped preserve true
+ // max preserve
+ t->paddingFactor = 1.0;
+ t->flags = 0;
+ t->capFirstNewRecord = DiskLoc();
+ t->capFirstNewRecord.setInvalid();
+ t->cappedLastDelRecLastExtent().setInvalid();
+ // dataFileVersion preserve
+ // indexFileVersion preserve
+ t->multiKeyIndexBits = 0;
+ t->reservedA = 0;
+ t->extraOffset = 0;
+ // indexBuildInProgress preserve 0
+ memset(t->reserved, 0, sizeof(t->reserved));
+
+ // Reset all existing extents and recreate the deleted list.
+ for( DiskLoc ext = firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) {
+ DiskLoc prev = ext.ext()->xprev;
+ DiskLoc next = ext.ext()->xnext;
+ DiskLoc empty = ext.ext()->reuse( ns, true );
+ ext.ext()->xprev.writing() = prev;
+ ext.ext()->xnext.writing() = next;
+ addDeletedRec( empty.drec(), empty );
+ }
+ }
+
+}
diff --git a/src/mongo/db/client.cpp b/src/mongo/db/client.cpp
new file mode 100644
index 00000000000..92b78d87ee5
--- /dev/null
+++ b/src/mongo/db/client.cpp
@@ -0,0 +1,697 @@
+// client.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Client represents a connection to the database (the server-side) and corresponds
+ to an open socket (or logical connection if pooling on sockets) from a client.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "client.h"
+#include "curop-inl.h"
+#include "json.h"
+#include "security.h"
+#include "commands.h"
+#include "instance.h"
+#include "../s/d_logic.h"
+#include "dbwebserver.h"
+#include "../util/mongoutils/html.h"
+#include "../util/mongoutils/checksum.h"
+#include "../util/file_allocator.h"
+#include "repl/rs.h"
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ Client* Client::syncThread;
+ mongo::mutex Client::clientsMutex("clientsMutex");
+ set<Client*> Client::clients; // always be in clientsMutex when manipulating this
+
+ TSP_DEFINE(Client, currentClient)
+
+#if defined(_DEBUG)
+ struct StackChecker;
+ ThreadLocalValue<StackChecker *> checker;
+
+ struct StackChecker {
+ enum { SZ = 256 * 1024 };
+ char buf[SZ];
+ StackChecker() {
+ checker.set(this);
+ }
+ void init() {
+ memset(buf, 42, sizeof(buf));
+ }
+ static void check(const char *tname) {
+ static int max;
+ StackChecker *sc = checker.get();
+ const char *p = sc->buf;
+ int i = 0;
+ for( ; i < SZ; i++ ) {
+ if( p[i] != 42 )
+ break;
+ }
+ int z = SZ-i;
+ if( z > max ) {
+ max = z;
+ log() << "thread " << tname << " stack usage was " << z << " bytes" << endl;
+ }
+ wassert( i > 16000 );
+ }
+ };
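+
+    // Explanatory note: init() paints the 256KB buffer with the byte 42; any
+    // stack frames that later grow down through it overwrite the pattern.
+    // check() scans from the low end for the first non-42 byte, so i is the
+    // untouched headroom and SZ - i is the stack high-water mark; the
+    // wassert( i > 16000 ) warns when less than ~16KB of headroom remained.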
+#endif
+
+ /* each thread which does db operations has a Client object in TLS.
+ call this when your thread starts.
+ */
+#if defined _DEBUG
+ static unsigned long long nThreads = 0;
+ void assertStartingUp() {
+ assert( nThreads <= 1 );
+ }
+#else
+ void assertStartingUp() { }
+#endif
+
+ Client& Client::initThread(const char *desc, AbstractMessagingPort *mp) {
+#if defined(_DEBUG)
+ {
+            nThreads++; // never decremented; used by the assertStartingUp() sanity asserts
+ if( sizeof(void*) == 8 ) {
+ StackChecker sc;
+ sc.init();
+ }
+ }
+#endif
+ assert( currentClient.get() == 0 );
+ Client *c = new Client(desc, mp);
+ currentClient.reset(c);
+ mongo::lastError.initThread();
+ return *c;
+ }
+
+ Client::Client(const char *desc, AbstractMessagingPort *p) :
+ _context(0),
+ _shutdown(false),
+ _desc(desc),
+ _god(0),
+ _lastOp(0),
+ _mp(p),
+ _sometimes(0)
+ {
+ _hasWrittenThisPass = false;
+ _pageFaultRetryableSection = 0;
+ _connectionId = setThreadName(desc);
+ _curOp = new CurOp( this );
+#ifndef _WIN32
+ stringstream temp;
+ temp << hex << showbase << pthread_self();
+ _threadId = temp.str();
+#endif
+ scoped_lock bl(clientsMutex);
+ clients.insert(this);
+ }
+
+ Client::~Client() {
+ _god = 0;
+
+ if ( _context )
+ error() << "Client::~Client _context should be null but is not; client:" << _desc << endl;
+
+ if ( ! _shutdown ) {
+ error() << "Client::shutdown not called: " << _desc << endl;
+ }
+
+ if ( ! inShutdown() ) {
+ // we can't clean up safely once we're in shutdown
+ scoped_lock bl(clientsMutex);
+ if ( ! _shutdown )
+ clients.erase(this);
+ delete _curOp;
+ }
+ }
+
+ bool Client::shutdown() {
+#if defined(_DEBUG)
+ {
+ if( sizeof(void*) == 8 ) {
+ StackChecker::check( desc() );
+ }
+ }
+#endif
+ _shutdown = true;
+ if ( inShutdown() )
+ return false;
+ {
+ scoped_lock bl(clientsMutex);
+ clients.erase(this);
+ if ( isSyncThread() ) {
+ syncThread = 0;
+ }
+ }
+
+ return false;
+ }
+
+ BSONObj CachedBSONObj::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}");
+ Client::Context::Context( string ns , Database * db, bool doauth ) :
+ _client( currentClient.get() ),
+ _oldContext( _client->_context ),
+ _path( mongo::dbpath ), // is this right? could be a different db? may need a dassert for this
+ _justCreated(false),
+ _ns( ns ),
+ _db(db)
+ {
+ assert( db == 0 || db->isOk() );
+ _client->_context = this;
+ checkNsAccess( doauth );
+ _client->checkLocks();
+ }
+
+ Client::Context::Context(const string& ns, string path , bool doauth ) :
+ _client( currentClient.get() ),
+ _oldContext( _client->_context ),
+ _path( path ),
+ _justCreated(false), // set for real in finishInit
+ _ns( ns ),
+ _db(0)
+ {
+ _finishInit( doauth );
+ _client->checkLocks();
+ }
+
+ /** "read lock, and set my context, all in one operation"
+ * This handles (if not recursively locked) opening an unopened database.
+ */
+ Client::ReadContext::ReadContext(const string& ns, string path, bool doauth ) {
+ {
+ lk.reset( new _LockCollectionForReading(ns) );
+ Database *db = dbHolder().get(ns, path);
+ if( db ) {
+ c.reset( new Context(path, ns, db, doauth) );
+ return;
+ }
+ }
+
+ // we usually don't get here, so doesn't matter how fast this part is
+ {
+ int x = d.dbMutex.getState();
+ if( x > 0 ) {
+ // write locked already
+ DEV RARELY log() << "write locked on ReadContext construction " << ns << endl;
+ c.reset( new Context(ns, path, doauth) );
+ }
+ else if( x == -1 ) {
+ lk.reset(0);
+ {
+ writelock w;
+ Context c(ns, path, doauth);
+ }
+ // db could be closed at this interim point -- that is ok, we will throw, and don't mind throwing.
+ lk.reset( new _LockCollectionForReading(ns) );
+ c.reset( new Context(ns, path, doauth) );
+ }
+ else {
+ assert( x < -1 );
+ uasserted(15928, str::stream() << "can't open a database from a nested read lock " << ns);
+ }
+ }
+
+ // todo: are receipts of thousands of queries for a nonexisting database a potential
+ // cause of bad performance due to the write lock acquisition above? let's fix that.
+ // it would be easy to first check that there is at least a .ns file, or something similar.
+ }
+
+ void Client::Context::checkNotStale() const {
+ switch ( _client->_curOp->getOp() ) {
+ case dbGetMore: // getMore's are special and should be handled else where
+ case dbUpdate: // update & delete check shard version in instance.cpp, so don't check here as well
+ case dbDelete:
+ break;
+ default: {
+ string errmsg;
+ if ( ! shardVersionOk( _ns , errmsg ) ) {
+ ostringstream os;
+ os << "[" << _ns << "] shard version not ok in Client::Context: " << errmsg;
+ throw SendStaleConfigException( _ns, os.str() );
+ }
+ }
+ }
+ }
+
+ // invoked from ReadContext
+ Client::Context::Context(const string& path, const string& ns, Database *db , bool doauth) :
+ _client( currentClient.get() ),
+ _oldContext( _client->_context ),
+ _path( path ),
+ _justCreated(false),
+ _ns( ns ),
+ _db(db)
+ {
+ assert(_db);
+ checkNotStale();
+ _client->_context = this;
+ _client->_curOp->enter( this );
+ checkNsAccess( doauth, d.dbMutex.getState() );
+ _client->checkLocks();
+ }
+
+ void Client::Context::_finishInit( bool doauth ) {
+ int lockState = d.dbMutex.getState();
+ assert( lockState );
+ if ( lockState > 0 && FileAllocator::get()->hasFailed() ) {
+ uassert(14031, "Can't take a write lock while out of disk space", false);
+ }
+
+ _db = dbHolderUnchecked().getOrCreate( _ns , _path , _justCreated );
+ assert(_db);
+ checkNotStale();
+ _client->_context = this;
+ _client->_curOp->enter( this );
+ checkNsAccess( doauth, lockState );
+ }
+
+ void Client::Context::_auth( int lockState ) {
+ if ( _client->_ai.isAuthorizedForLock( _db->name , lockState ) )
+ return;
+
+ // before we assert, do a little cleanup
+ _client->_context = _oldContext; // note: _oldContext may be null
+
+ stringstream ss;
+ ss << "unauthorized db:" << _db->name << " lock type:" << lockState << " client:" << _client->clientAddress();
+ uasserted( 10057 , ss.str() );
+ }
+
+ Client::Context::~Context() {
+ DEV assert( _client == currentClient.get() );
+ _client->_curOp->leave( this );
+ _client->_context = _oldContext; // note: _oldContext may be null
+ }
+
+ bool Client::Context::inDB( const string& db , const string& path ) const {
+ if ( _path != path )
+ return false;
+
+ if ( db == _ns )
+ return true;
+
+ string::size_type idx = _ns.find( db );
+ if ( idx != 0 )
+ return false;
+
+ return _ns[db.size()] == '.';
+ }
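+
+    // e.g. (illustrative): with _ns == "test.users", inDB( "test", _path )
+    // is true, while inDB( "tes", _path ) is false because the character
+    // after the prefix must be the '.' namespace separator.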
+
+ void Client::Context::checkNsAccess( bool doauth, int lockState ) {
+ if ( 0 ) { // SERVER-4276
+ uassert( 15929, "client access to index backing namespace prohibited", NamespaceString::normal( _ns.c_str() ) );
+ }
+ if ( doauth ) {
+ _auth( lockState );
+ }
+ }
+
+ void Client::appendLastOp( BSONObjBuilder& b ) const {
+ // _lastOp is never set if replication is off
+ if( theReplSet || ! _lastOp.isNull() ) {
+ b.appendTimestamp( "lastOp" , _lastOp.asDate() );
+ }
+ }
+
+ string Client::clientAddress(bool includePort) const {
+ if( _curOp )
+ return _curOp->getRemoteString(includePort);
+ return "";
+ }
+
+ string Client::toString() const {
+ stringstream ss;
+ if ( _curOp )
+ ss << _curOp->infoNoauth().jsonString();
+ return ss.str();
+ }
+
+ string sayClientState() {
+ Client* c = currentClient.get();
+ if ( !c )
+ return "no client";
+ return c->toString();
+ }
+
+ Client* curopWaitingForLock( int type ) {
+ Client * c = currentClient.get();
+ assert( c );
+ CurOp * co = c->curop();
+ if ( co ) {
+ co->waitingForLock( type );
+ }
+ return c;
+ }
+ void curopGotLock(Client *c) {
+ assert(c);
+ CurOp * co = c->curop();
+ if ( co )
+ co->gotLock();
+ }
+
+ void KillCurrentOp::interruptJs( AtomicUInt *op ) {
+ if ( !globalScriptEngine )
+ return;
+ if ( !op ) {
+ globalScriptEngine->interruptAll();
+ }
+ else {
+ globalScriptEngine->interrupt( *op );
+ }
+ }
+
+ void KillCurrentOp::killAll() {
+ _globalKill = true;
+ interruptJs( 0 );
+ }
+
+ void KillCurrentOp::kill(AtomicUInt i) {
+ bool found = false;
+ {
+ scoped_lock l( Client::clientsMutex );
+ for( set< Client* >::const_iterator j = Client::clients.begin(); !found && j != Client::clients.end(); ++j ) {
+ for( CurOp *k = ( *j )->curop(); !found && k; k = k->parent() ) {
+ if ( k->opNum() == i ) {
+ k->kill();
+ for( CurOp *l = ( *j )->curop(); l != k; l = l->parent() ) {
+ l->kill();
+ }
+ found = true;
+ }
+ }
+ }
+ }
+ if ( found ) {
+ interruptJs( &i );
+ }
+ }
+
+ void Client::gotHandshake( const BSONObj& o ) {
+ BSONObjIterator i(o);
+
+ {
+ BSONElement id = i.next();
+ assert( id.type() );
+ _remoteId = id.wrap( "_id" );
+ }
+
+ BSONObjBuilder b;
+ while ( i.more() )
+ b.append( i.next() );
+
+ b.appendElementsUnique( _handshake );
+
+ _handshake = b.obj();
+
+ if (theReplSet && o.hasField("member")) {
+ theReplSet->ghost->associateSlave(_remoteId, o["member"].Int());
+ }
+ }
+
+ ClientBasic* ClientBasic::getCurrent() {
+ return currentClient.get();
+ }
+
+ class HandshakeCmd : public Command {
+ public:
+ void help(stringstream& h) const { h << "internal"; }
+ HandshakeCmd() : Command( "handshake" ) {}
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ Client& c = cc();
+ c.gotHandshake( cmdObj );
+ return true;
+ }
+
+ } handshakeCmd;
+
+ class ClientListPlugin : public WebStatusPlugin {
+ public:
+ ClientListPlugin() : WebStatusPlugin( "clients" , 20 ) {}
+ virtual void init() {}
+
+ virtual void run( stringstream& ss ) {
+ using namespace mongoutils::html;
+
+ ss << "\n<table border=1 cellpadding=2 cellspacing=0>";
+ ss << "<tr align='left'>"
+ << th( a("", "Connections to the database, both internal and external.", "Client") )
+ << th( a("http://www.mongodb.org/display/DOCS/Viewing+and+Terminating+Current+Operation", "", "OpId") )
+ << "<th>Active</th>"
+ << "<th>LockType</th>"
+ << "<th>Waiting</th>"
+ << "<th>SecsRunning</th>"
+ << "<th>Op</th>"
+ << th( a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "", "Namespace") )
+ << "<th>Query</th>"
+ << "<th>client</th>"
+ << "<th>msg</th>"
+ << "<th>progress</th>"
+
+ << "</tr>\n";
+ {
+ scoped_lock bl(Client::clientsMutex);
+ for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
+ Client *c = *i;
+ CurOp& co = *(c->curop());
+ ss << "<tr><td>" << c->desc() << "</td>";
+
+ tablecell( ss , co.opNum() );
+ tablecell( ss , co.active() );
+ {
+ int lt = co.getLockType();
+ if( lt == -1 ) tablecell(ss, "R");
+ else if( lt == 1 ) tablecell(ss, "W");
+ else
+ tablecell( ss , lt);
+ }
+ tablecell( ss , co.isWaitingForLock() );
+ if ( co.active() )
+ tablecell( ss , co.elapsedSeconds() );
+ else
+ tablecell( ss , "" );
+ tablecell( ss , co.getOp() );
+ tablecell( ss , co.getNS() );
+ if ( co.haveQuery() ) {
+ tablecell( ss , co.query() );
+ }
+ else
+ tablecell( ss , "" );
+ tablecell( ss , co.getRemoteString() );
+
+ tablecell( ss , co.getMessage() );
+ tablecell( ss , co.getProgressMeter().toString() );
+
+
+ ss << "</tr>\n";
+ }
+ }
+ ss << "</table>\n";
+
+ }
+
+ } clientListPlugin;
+
+ int Client::recommendedYieldMicros( int * writers , int * readers ) {
+ int num = 0;
+ int w = 0;
+ int r = 0;
+ {
+ scoped_lock bl(clientsMutex);
+ for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ) {
+ Client* c = *i;
+ if ( c->curop()->isWaitingForLock() ) {
+ num++;
+ if ( c->curop()->getLockType() > 0 )
+ w++;
+ else
+ r++;
+ }
+ }
+ }
+
+ if ( writers )
+ *writers = w;
+ if ( readers )
+ *readers = r;
+
+ int time = r * 100;
+ time += w * 500;
+
+ time = min( time , 1000000 );
+
+ // if there has been a kill request for this op, we should yield to allow the op to stop.
+ // checkForInterruptNoAssert() returns an empty string if we aren't interrupted.
+ if ( *killCurrentOp.checkForInterruptNoAssert() ) {
+ return 100;
+ }
+
+ return time;
+ }
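+
+ // Worked example (illustrative): with 3 waiting readers and 1 waiting writer,
+ // time = 3*100 + 1*500 = 800 micros; the value is capped at 1000000 (one
+ // second), and a pending kill request shortens the suggestion to 100.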
+
+ int Client::getActiveClientCount( int& writers, int& readers ) {
+ writers = 0;
+ readers = 0;
+
+ scoped_lock bl(clientsMutex);
+ for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ) {
+ Client* c = *i;
+ if ( ! c->curop()->active() )
+ continue;
+
+ int l = c->curop()->getLockType();
+ if ( l > 0 )
+ writers++;
+ else if ( l < 0 )
+ readers++;
+
+ }
+
+ return writers + readers;
+ }
+
+ void OpDebug::reset() {
+ extra.reset();
+
+ op = 0;
+ iscommand = false;
+ ns = "";
+ query = BSONObj();
+ updateobj = BSONObj();
+
+ cursorid = -1;
+ ntoreturn = -1;
+ ntoskip = -1;
+ exhaust = false;
+
+ nscanned = -1;
+ idhack = false;
+ scanAndOrder = false;
+ moved = false;
+ fastmod = false;
+ fastmodinsert = false;
+ upsert = false;
+ keyUpdates = 0; // unsigned, so -1 not possible
+
+ exceptionInfo.reset();
+
+ executionTime = 0;
+ nreturned = -1;
+ responseLength = -1;
+ }
+
+
+#define OPDEBUG_TOSTRING_HELP(x) if( x >= 0 ) s << " " #x ":" << (x)
+#define OPDEBUG_TOSTRING_HELP_BOOL(x) if( x ) s << " " #x ":" << (x)
+ string OpDebug::toString() const {
+ StringBuilder s( ns.size() + 64 );
+ if ( iscommand )
+ s << "command ";
+ else
+ s << opToString( op ) << ' ';
+ s << ns.toString();
+
+ if ( ! query.isEmpty() ) {
+ if ( iscommand )
+ s << " command: ";
+ else
+ s << " query: ";
+ s << query.toString();
+ }
+
+ if ( ! updateobj.isEmpty() ) {
+ s << " update: ";
+ updateobj.toString( s );
+ }
+
+ OPDEBUG_TOSTRING_HELP( cursorid );
+ OPDEBUG_TOSTRING_HELP( ntoreturn );
+ OPDEBUG_TOSTRING_HELP( ntoskip );
+ OPDEBUG_TOSTRING_HELP_BOOL( exhaust );
+
+ OPDEBUG_TOSTRING_HELP( nscanned );
+ OPDEBUG_TOSTRING_HELP_BOOL( idhack );
+ OPDEBUG_TOSTRING_HELP_BOOL( scanAndOrder );
+ OPDEBUG_TOSTRING_HELP_BOOL( moved );
+ OPDEBUG_TOSTRING_HELP_BOOL( fastmod );
+ OPDEBUG_TOSTRING_HELP_BOOL( fastmodinsert );
+ OPDEBUG_TOSTRING_HELP_BOOL( upsert );
+ OPDEBUG_TOSTRING_HELP( keyUpdates );
+
+ if ( extra.len() )
+ s << " " << extra.str();
+
+ if ( ! exceptionInfo.empty() ) {
+ s << " exception: " << exceptionInfo.msg;
+ if ( exceptionInfo.code )
+ s << " code:" << exceptionInfo.code;
+ }
+
+ OPDEBUG_TOSTRING_HELP( nreturned );
+ if ( responseLength )
+ s << " reslen:" << responseLength;
+ s << " " << executionTime << "ms";
+
+ return s.str();
+ }
+
+#define OPDEBUG_APPEND_NUMBER(x) if( x != -1 ) b.append( #x , (x) )
+#define OPDEBUG_APPEND_BOOL(x) if( x ) b.appendBool( #x , (x) )
+ void OpDebug::append( const CurOp& curop, BSONObjBuilder& b ) const {
+ b.append( "op" , iscommand ? "command" : opToString( op ) );
+ b.append( "ns" , ns.toString() );
+ if ( ! query.isEmpty() )
+ b.append( iscommand ? "command" : "query" , query );
+ else if ( ! iscommand && curop.haveQuery() )
+ curop.appendQuery( b , "query" );
+
+ if ( ! updateobj.isEmpty() )
+ b.append( "updateobj" , updateobj );
+
+ OPDEBUG_APPEND_NUMBER( cursorid );
+ OPDEBUG_APPEND_NUMBER( ntoreturn );
+ OPDEBUG_APPEND_NUMBER( ntoskip );
+ OPDEBUG_APPEND_BOOL( exhaust );
+
+ OPDEBUG_APPEND_NUMBER( nscanned );
+ OPDEBUG_APPEND_BOOL( idhack );
+ OPDEBUG_APPEND_BOOL( scanAndOrder );
+ OPDEBUG_APPEND_BOOL( moved );
+ OPDEBUG_APPEND_BOOL( fastmod );
+ OPDEBUG_APPEND_BOOL( fastmodinsert );
+ OPDEBUG_APPEND_BOOL( upsert );
+ OPDEBUG_APPEND_NUMBER( keyUpdates );
+
+ if ( ! exceptionInfo.empty() )
+ exceptionInfo.append( b , "exception" , "exceptionCode" );
+
+ OPDEBUG_APPEND_NUMBER( nreturned );
+ OPDEBUG_APPEND_NUMBER( responseLength );
+ b.append( "millis" , executionTime );
+
+ }
+
+}
diff --git a/src/mongo/db/client.h b/src/mongo/db/client.h
new file mode 100644
index 00000000000..6aa8bc00f02
--- /dev/null
+++ b/src/mongo/db/client.h
@@ -0,0 +1,286 @@
+/* @file db/client.h
+
+ "Client" represents a connection to the database (the server-side) and corresponds
+ to an open socket (or logical connection if pooling on sockets) from a client.
+
+ todo: switch to asio...this will fit nicely with that.
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "security.h"
+#include "namespace-inl.h"
+#include "lasterror.h"
+#include "stats/top.h"
+#include "../db/client_common.h"
+#include "../util/concurrency/threadlocal.h"
+#include "../util/net/message_port.h"
+#include "../util/concurrency/rwlock.h"
+#include "d_concurrency.h"
+
+namespace mongo {
+
+ extern class ReplSet *theReplSet;
+ class AuthenticationInfo;
+ class Database;
+ class CurOp;
+ class Command;
+ class Client;
+ class AbstractMessagingPort;
+ class LockCollectionForReading;
+ class PageFaultRetryableSection;
+
+#if defined(CLC)
+ typedef LockCollectionForReading _LockCollectionForReading;
+#else
+ typedef readlock _LockCollectionForReading;
+#endif
+
+ TSP_DECLARE(Client, currentClient)
+
+ typedef long long ConnectionId;
+
+ /** the database's concept of an outside "client" */
+ class Client : public ClientBasic {
+ static Client *syncThread;
+ public:
+ // always hold clientsMutex when manipulating this. killop stuff uses these.
+ static set<Client*> clients;
+ static mongo::mutex clientsMutex;
+ static int getActiveClientCount( int& writers , int& readers );
+ class Context;
+ ~Client();
+ static int recommendedYieldMicros( int * writers = 0 , int * readers = 0 );
+
+ /** each thread which does db operations has a Client object in TLS.
+ * call this when your thread starts.
+ */
+ static Client& initThread(const char *desc, AbstractMessagingPort *mp = 0);
+
+ static void initThreadIfNotAlready(const char *desc) {
+ if( currentClient.get() )
+ return;
+ initThread(desc);
+ }
+
+ /** this has to be called as the client goes away, but before thread termination
+ * @return true if anything was done
+ */
+ bool shutdown();
+
+ /** set so isSyncThread() works */
+ void iAmSyncThread() {
+ wassert( syncThread == 0 );
+ syncThread = this;
+ }
+ /** @return true if this client is the replication secondary pull thread. not used much, is used in create index sync code. */
+ bool isSyncThread() const { return this == syncThread; }
+
+ string clientAddress(bool includePort=false) const;
+ const AuthenticationInfo * getAuthenticationInfo() const { return &_ai; }
+ AuthenticationInfo * getAuthenticationInfo() { return &_ai; }
+ bool isAdmin() { return _ai.isAuthorized( "admin" ); }
+ CurOp* curop() const { return _curOp; }
+ Context* getContext() const { return _context; }
+ Database* database() const { return _context ? _context->db() : 0; }
+ const char *ns() const { return _context->ns(); }
+ const char *desc() const { return _desc; }
+ void setLastOp( OpTime op ) { _lastOp = op; }
+ OpTime getLastOp() const { return _lastOp; }
+
+ /** caution -- use Context class instead */
+ void setContext(Context *c) { _context = c; }
+
+ /* report what the last operation was. used by getlasterror */
+ void appendLastOp( BSONObjBuilder& b ) const;
+
+ bool isGod() const { return _god; } /* this is for map/reduce writes */
+ string toString() const;
+ void gotHandshake( const BSONObj& o );
+ bool hasRemote() const { return _mp; }
+ HostAndPort getRemote() const { assert( _mp ); return _mp->remote(); }
+ BSONObj getRemoteID() const { return _remoteId; }
+ BSONObj getHandshake() const { return _handshake; }
+ AbstractMessagingPort * port() const { return _mp; }
+ ConnectionId getConnectionId() const { return _connectionId; }
+ private:
+ Client(const char *desc, AbstractMessagingPort *p = 0);
+ friend class CurOp;
+ ConnectionId _connectionId; // > 0 for "conn" (client connection) threads, 0 otherwise
+ string _threadId; // "" on systems without thread id support
+ CurOp * _curOp;
+ Context * _context;
+ bool _shutdown; // to track if Client::shutdown() gets called
+ const char * const _desc;
+ bool _god;
+ AuthenticationInfo _ai;
+ OpTime _lastOp;
+ BSONObj _handshake;
+ BSONObj _remoteId;
+ AbstractMessagingPort * const _mp;
+ unsigned _sometimes;
+ public:
+ bool _hasWrittenThisPass;
+ PageFaultRetryableSection *_pageFaultRetryableSection;
+
+ /** the concept here is the same as MONGO_SOMETIMES. however, that
+ macro uses a static counter shared by all threads, and each
+ increment might eject that cache line from the other cpus' caches,
+ so the idea is that this per-client counter is better.
+ */
+ bool sometimes(unsigned howOften) { return ++_sometimes % howOften == 0; }
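+ // e.g. (hypothetical): if( cc().sometimes(128) ) log() << "occasional note" << endl;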
+
+ /* set _god=true temporarily, safely */
+ class GodScope {
+ bool _prev;
+ public:
+ GodScope();
+ ~GodScope();
+ };
+
+ //static void assureDatabaseIsOpen(const string& ns, string path=dbpath);
+
+ /** "read lock, and set my context, all in one operation"
+ * This handles (if not recursively locked) opening an unopened database.
+ */
+ class ReadContext : boost::noncopyable {
+ public:
+ ReadContext(const string& ns, string path=dbpath, bool doauth=true );
+ Context& ctx() { return *c.get(); }
+ private:
+ scoped_ptr<_LockCollectionForReading> lk;
+ scoped_ptr<Context> c;
+ };
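+
+ // Usage sketch (hypothetical namespace):
+ // Client::ReadContext rc("test.foo"); // read-locks and sets the context
+ // Database* db = rc.ctx().db();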
+
+ /* Set the database we want to use, then restore the previous one when we finish (go out of scope).
+ Note this is also helpful if an exception happens, as the state is fixed up.
+ */
+ class Context : boost::noncopyable {
+ public:
+ /** this is probably what you want */
+ Context(const string& ns, string path=dbpath, bool doauth=true );
+
+ /** note: this does not call finishInit -- i.e., does not call
+ shardVersionOk() for example.
+ see also: reset().
+ */
+ Context( string ns , Database * db, bool doauth=true );
+
+ // used by ReadContext
+ Context(const string& path, const string& ns, Database *db, bool doauth);
+
+ ~Context();
+ Client* getClient() const { return _client; }
+ Database* db() const { return _db; }
+ const char * ns() const { return _ns.c_str(); }
+ bool equals( const string& ns , const string& path=dbpath ) const { return _ns == ns && _path == path; }
+
+ /** @return if the db was created by this Context */
+ bool justCreated() const { return _justCreated; }
+
+ /** @return true iff the current Context is using db/path */
+ bool inDB( const string& db , const string& path=dbpath ) const;
+
+ void _clear() { // this is sort of an "early destruct" indication; _ns can never be restored once cleared
+ const_cast<string&>(_ns).clear(); // clear(), not empty(): empty() only tests and did nothing here
+ _db = 0;
+ }
+
+ /** call before unlocking, to clear any non-thread-safe state
+ * _db gets restored on the relock
+ */
+ void unlocked() { _db = 0; }
+
+ /** call after going back into the lock, will re-establish non-thread safe stuff */
+ void relocked() { _finishInit(); }
+
+ private:
+ friend class CurOp;
+ void _finishInit( bool doauth=true);
+ void _auth( int lockState );
+ void checkNotStale() const;
+ void checkNsAccess( bool doauth, int lockState = d.dbMutex.getState() );
+ Client * const _client;
+ Context * const _oldContext;
+ const string _path;
+ bool _justCreated;
+ const string _ns;
+ Database * _db;
+ }; // class Client::Context
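+
+ // Usage sketch, mirroring what cloner.cpp does below:
+ // writelock lk(ns); // take the lock first
+ // Client::Context ctx(ns); // then set (and auth-check) the context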
+
+ struct LockStatus {
+ LockStatus();
+ string whichCollection;
+ unsigned excluder, global, collection;
+ string toString() const;
+ } lockStatus;
+
+#if defined(CLC)
+ void checkLocks() const;
+#else
+ void checkLocks() const { }
+#endif
+
+ }; // class Client
+
+ /** get the Client object for this thread. */
+ inline Client& cc() {
+ Client * c = currentClient.get();
+ assert( c );
+ return *c;
+ }
+
+ inline Client::GodScope::GodScope() {
+ _prev = cc()._god;
+ cc()._god = true;
+ }
+ inline Client::GodScope::~GodScope() { cc()._god = _prev; }
+
+ /* this unreadlocks and then writelocks; i.e. it does NOT upgrade inside the
+ lock (and is thus wrong to use if you need that, which you usually do).
+ that said we use it today for a specific case where the usage is correct.
+ */
+#if 0
+ inline void mongolock::releaseAndWriteLock() {
+ if( !_writelock ) {
+
+#if BOOST_VERSION >= 103500
+ int s = d.dbMutex.getState();
+ if( s != -1 ) {
+ log() << "error: releaseAndWriteLock() s == " << s << endl;
+ msgasserted( 12600, "releaseAndWriteLock: unlock_shared failed, probably recursive" );
+ }
+#endif
+
+ _writelock = true;
+ d.dbMutex.unlock_shared();
+ d.dbMutex.lock();
+
+ // todo: unlocked() method says to call it before unlocking, not after. so fix this here,
+ // or fix the doc there.
+ if ( cc().getContext() )
+ cc().getContext()->unlocked();
+ }
+ }
+#endif
+
+ inline bool haveClient() { return currentClient.get() != 0; }
+
+};
diff --git a/src/mongo/db/client_common.h b/src/mongo/db/client_common.h
new file mode 100644
index 00000000000..eb70105ef99
--- /dev/null
+++ b/src/mongo/db/client_common.h
@@ -0,0 +1,47 @@
+// client_common.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+//#include "../pch.h"
+//#include "security.h"
+#include "../util/net/hostandport.h"
+
+namespace mongo {
+
+ class AuthenticationInfo;
+
+ /**
+ * this is the base class for Client and ClientInfo
+ * Client is for mongod
+ * ClientInfo is for mongos
+ * They should converge slowly
+ * The idea is this has the basic api so that not all code has to be duplicated
+ */
+ class ClientBasic : boost::noncopyable {
+ public:
+ virtual ~ClientBasic(){}
+ virtual const AuthenticationInfo * getAuthenticationInfo() const = 0;
+ virtual AuthenticationInfo * getAuthenticationInfo() = 0;
+
+ virtual bool hasRemote() const = 0;
+ virtual HostAndPort getRemote() const = 0;
+
+ static ClientBasic* getCurrent();
+ };
+}
diff --git a/src/mongo/db/clientcursor.cpp b/src/mongo/db/clientcursor.cpp
new file mode 100644
index 00000000000..dc04ec38f63
--- /dev/null
+++ b/src/mongo/db/clientcursor.cpp
@@ -0,0 +1,747 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* clientcursor.cpp
+
+ ClientCursor is a wrapper that represents a cursorid from our database
+ application's perspective.
+
+ Cursor -- and its derived classes -- are our internal cursors.
+*/
+
+#include "pch.h"
+#include "clientcursor.h"
+#include "introspect.h"
+#include <time.h>
+#include "db.h"
+#include "commands.h"
+#include "repl_block.h"
+#include "../util/processinfo.h"
+#include "../util/timer.h"
+#include "../server.h"
+
+namespace mongo {
+
+ CCById ClientCursor::clientCursorsById;
+ boost::recursive_mutex& ClientCursor::ccmutex( *(new boost::recursive_mutex()) );
+ long long ClientCursor::numberTimedOut = 0;
+
+ void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl ); // from s/d_logic.h
+
+ /*static*/ void ClientCursor::assertNoCursors() {
+ recursive_scoped_lock lock(ccmutex);
+ if( clientCursorsById.size() ) {
+ log() << "ERROR clientcursors exist but should not at this point" << endl;
+ ClientCursor *cc = clientCursorsById.begin()->second;
+ log() << "first one: " << cc->_cursorid << ' ' << cc->_ns << endl;
+ clientCursorsById.clear();
+ assert(false);
+ }
+ }
+
+
+ void ClientCursor::setLastLoc_inlock(DiskLoc L) {
+ assert( _pos != -2 ); // defensive - see ~ClientCursor
+
+ if ( L == _lastLoc )
+ return;
+
+ CCByLoc& bl = byLoc();
+
+ if ( !_lastLoc.isNull() ) {
+ bl.erase( ByLocKey( _lastLoc, _cursorid ) );
+ }
+
+ if ( !L.isNull() )
+ bl[ByLocKey(L,_cursorid)] = this;
+ _lastLoc = L;
+ }
+
+ /* ------------------------------------------- */
+
+ /* must call this when a btree node is updated */
+ //void removedKey(const DiskLoc& btreeLoc, int keyPos) {
+ //}
+
+ // ns is either a full namespace or "dbname." when invalidating for a whole db
+ void ClientCursor::invalidate(const char *ns) {
+ d.dbMutex.assertWriteLocked();
+ int len = strlen(ns);
+ const char* dot = strchr(ns, '.');
+ assert( len > 0 && dot);
+
+ bool isDB = (dot == &ns[len-1]); // first (and only) dot is the last char
+
+ {
+ //cout << "\nTEMP invalidate " << ns << endl;
+ recursive_scoped_lock lock(ccmutex);
+
+ Database *db = cc().database();
+ assert(db);
+ assert( str::startsWith(ns, db->name) );
+
+ for( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); /*++i*/ ) {
+ ClientCursor *cc = i->second;
+
+ ++i; // we may be removing this node
+
+ if( cc->_db != db )
+ continue;
+
+ if (isDB) {
+ // already checked that db matched above
+ dassert( str::startsWith(cc->_ns.c_str(), ns) );
+ delete cc; //removes self from ccByID
+ }
+ else {
+ if ( str::equals(cc->_ns.c_str(), ns) )
+ delete cc; //removes self from ccByID
+ }
+ }
+
+ /*
+ note : we can't iterate byloc because clientcursors may exist with a loc of null in which case
+ they are not in the map. perhaps they should not exist though in the future? something to
+ change???
+
+ CCByLoc& bl = db->ccByLoc;
+ for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); ++i ) {
+ ClientCursor *cc = i->second;
+ if ( strncmp(ns, cc->ns.c_str(), len) == 0 ) {
+ assert( cc->_db == db );
+ toDelete.push_back(i->second);
+ }
+ }*/
+
+ /*cout << "TEMP after invalidate " << endl;
+ for( auto i = clientCursorsById.begin(); i != clientCursorsById.end(); ++i ) {
+ cout << " " << i->second->ns << endl;
+ }
+ cout << "TEMP after invalidate done" << endl;*/
+ }
+ }
+
+ /* note called outside of locks (other than ccmutex) so care must be exercised */
+ bool ClientCursor::shouldTimeout( unsigned millis ) {
+ _idleAgeMillis += millis;
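+ // 600000 millis == 10 minutes of idle time; a nonzero _pinValue means the
+ // cursor is pinned (in use) or marked no-timeout, so it is never timed out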
+ return _idleAgeMillis > 600000 && _pinValue == 0;
+ }
+
+ /* called every 4 seconds. millis is amount of idle time passed since the last call -- could be zero */
+ void ClientCursor::idleTimeReport(unsigned millis) {
+ bool foundSomeToTimeout = false;
+
+ // two passes so that we don't need to readlock unless we really do some timeouts
+ // we assume here that incrementing _idleAgeMillis outside readlock is ok.
+ {
+ recursive_scoped_lock lock(ccmutex);
+ {
+ unsigned sz = clientCursorsById.size();
+ static time_t last;
+ if( sz >= 100000 ) {
+ if( time(0) - last > 300 ) {
+ last = time(0);
+ log() << "warning number of open cursors is very large: " << sz << endl;
+ }
+ }
+ }
+ for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) {
+ CCById::iterator j = i;
+ i++;
+ if( j->second->shouldTimeout( millis ) ) {
+ foundSomeToTimeout = true;
+ break;
+ }
+ }
+ }
+
+ if( foundSomeToTimeout ) {
+ // todo: ideally all readlocks automatically note what we are locking for so this
+ // can be reported in currentop command. e.g. something like:
+ // readlock lk("", "timeout cursors");
+ readlock lk("");
+ recursive_scoped_lock lock(ccmutex);
+ for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) {
+ CCById::iterator j = i;
+ i++;
+ if( j->second->shouldTimeout(0) ) {
+ numberTimedOut++;
+ LOG(1) << "killing old cursor " << j->second->_cursorid << ' ' << j->second->_ns
+ << " idle:" << j->second->idleTime() << "ms\n";
+ delete j->second;
+ }
+ }
+ }
+ }
+
+ /* must call when a btree bucket is going away.
+ note this is potentially slow
+ */
+ void ClientCursor::informAboutToDeleteBucket(const DiskLoc& b) {
+ recursive_scoped_lock lock(ccmutex);
+ Database *db = cc().database();
+ CCByLoc& bl = db->ccByLoc;
+ RARELY if ( bl.size() > 70 ) {
+ log() << "perf warning: byLoc.size=" << bl.size() << " in aboutToDeleteBucket\n";
+ }
+ if( bl.size() == 0 ) {
+ DEV tlog() << "debug warning: no cursors found in informAboutToDeleteBucket()" << endl;
+ }
+ for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); i++ )
+ i->second->_c->aboutToDeleteBucket(b);
+ }
+ void aboutToDeleteBucket(const DiskLoc& b) {
+ ClientCursor::informAboutToDeleteBucket(b);
+ }
+
+ /* must call this on a delete so we clean up the cursors. */
+ void ClientCursor::aboutToDelete(const DiskLoc& dl) {
+ recursive_scoped_lock lock(ccmutex);
+
+ Database *db = cc().database();
+ assert(db);
+
+ aboutToDeleteForSharding( db , dl );
+
+ CCByLoc& bl = db->ccByLoc;
+ CCByLoc::iterator j = bl.lower_bound(ByLocKey::min(dl));
+ CCByLoc::iterator stop = bl.upper_bound(ByLocKey::max(dl));
+ if ( j == stop )
+ return;
+
+ vector<ClientCursor*> toAdvance;
+
+ while ( 1 ) {
+ toAdvance.push_back(j->second);
+ DEV assert( j->first.loc == dl );
+ ++j;
+ if ( j == stop )
+ break;
+ }
+
+ if( toAdvance.size() >= 3000 ) {
+ log() << "perf warning MPW101: " << toAdvance.size() << " cursors for one diskloc "
+ << dl.toString()
+ << ' ' << toAdvance[1000]->_ns
+ << ' ' << toAdvance[2000]->_ns
+ << ' ' << toAdvance[1000]->_pinValue
+ << ' ' << toAdvance[2000]->_pinValue
+ << ' ' << toAdvance[1000]->_pos
+ << ' ' << toAdvance[2000]->_pos
+ << ' ' << toAdvance[1000]->_idleAgeMillis
+ << ' ' << toAdvance[2000]->_idleAgeMillis
+ << ' ' << toAdvance[1000]->_doingDeletes
+ << ' ' << toAdvance[2000]->_doingDeletes
+ << endl;
+ //wassert( toAdvance.size() < 5000 );
+ }
+
+ for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ) {
+ ClientCursor* cc = *i;
+ wassert(cc->_db == db);
+
+ if ( cc->_doingDeletes ) continue;
+
+ Cursor *c = cc->_c.get();
+ if ( c->capped() ) {
+ /* note we cannot advance here. if this condition occurs, writes to the oplog
+ have "caught" the reader. skipping ahead, the reader would miss postentially
+ important data.
+ */
+ delete cc;
+ continue;
+ }
+
+ c->checkLocation();
+ DiskLoc tmp1 = c->refLoc();
+ if ( tmp1 != dl ) {
+ // This might indicate a failure to call ClientCursor::updateLocation() but it can
+ // also happen during correct operation, see SERVER-2009.
+ problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl;
+ }
+ else {
+ c->advance();
+ }
+ while (!c->eof() && c->refLoc() == dl) {
+ /* We don't delete at EOF because we want to return "no more results" rather than "no such cursor".
+ * The loop is to handle MultiKey indexes where the deleted record is pointed to by multiple adjacent keys.
+ * In that case we need to advance until we get to the next distinct record or EOF.
+ * SERVER-4154
+ */
+ c->advance();
+ }
+ cc->updateLocation();
+ }
+ }
+ void aboutToDelete(const DiskLoc& dl) { ClientCursor::aboutToDelete(dl); }
+
+ ClientCursor::ClientCursor(int queryOptions, const shared_ptr<Cursor>& c, const string& ns, BSONObj query ) :
+ _ns(ns), _db( cc().database() ),
+ _c(c), _pos(0),
+ _query(query), _queryOptions(queryOptions),
+ _idleAgeMillis(0), _pinValue(0),
+ _doingDeletes(false), _yieldSometimesTracker(128,10) {
+
+ d.dbMutex.assertAtLeastReadLocked();
+
+ assert( _db );
+ assert( str::startsWith(_ns, _db->name) );
+ if( queryOptions & QueryOption_NoCursorTimeout )
+ noTimeout();
+ recursive_scoped_lock lock(ccmutex);
+ _cursorid = allocCursorId_inlock();
+ clientCursorsById.insert( make_pair(_cursorid, this) );
+
+ if ( ! _c->modifiedKeys() ) {
+ // store index information so we can decide if we can
+ // get something out of the index key rather than full object
+
+ int x = 0;
+ BSONObjIterator i( _c->indexKeyPattern() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.isNumber() ) {
+ // only want basic index fields, not "2d" etc
+ _indexedFields[e.fieldName()] = x;
+ }
+ x++;
+ }
+ }
+
+ }
+
+
+ ClientCursor::~ClientCursor() {
+ if( _pos == -2 ) {
+ // defensive: destructor called twice
+ wassert(false);
+ return;
+ }
+
+ {
+ recursive_scoped_lock lock(ccmutex);
+ setLastLoc_inlock( DiskLoc() ); // removes us from bylocation multimap
+ clientCursorsById.erase(_cursorid);
+
+ // defensive:
+ (CursorId&)_cursorid = -1;
+ _pos = -2;
+ }
+ }
+
+ bool ClientCursor::getFieldsDotted( const string& name, BSONElementSet &ret, BSONObj& holder ) {
+
+ map<string,int>::const_iterator i = _indexedFields.find( name );
+ if ( i == _indexedFields.end() ) {
+ current().getFieldsDotted( name , ret );
+ return false;
+ }
+
+ int x = i->second;
+
+ holder = currKey();
+ BSONObjIterator it( holder );
+ while ( x && it.more() ) {
+ it.next();
+ x--;
+ }
+ assert( x == 0 );
+ ret.insert( it.next() );
+ return true;
+ }
+
+ BSONElement ClientCursor::getFieldDotted( const string& name , BSONObj& holder , bool * fromKey ) {
+
+ map<string,int>::const_iterator i = _indexedFields.find( name );
+ if ( i == _indexedFields.end() ) {
+ if ( fromKey )
+ *fromKey = false;
+ holder = current();
+ return holder.getFieldDotted( name );
+ }
+
+ int x = i->second;
+
+ holder = currKey();
+ BSONObjIterator it( holder );
+ while ( x && it.more() ) {
+ it.next();
+ x--;
+ }
+ assert( x == 0 );
+
+ if ( fromKey )
+ *fromKey = true;
+ return it.next();
+ }
+
+ BSONObj ClientCursor::extractFields(const BSONObj &pattern , bool fillWithNull ) {
+ BSONObjBuilder b( pattern.objsize() * 2 );
+
+ BSONObj holder;
+
+ BSONObjIterator i( pattern );
+ while ( i.more() ) {
+ BSONElement key = i.next();
+ BSONElement value = getFieldDotted( key.fieldName() , holder );
+
+ if ( value.type() ) {
+ b.appendAs( value , key.fieldName() );
+ continue;
+ }
+
+ if ( fillWithNull )
+ b.appendNull( key.fieldName() );
+
+ }
+
+ return b.obj();
+ }
+
+
+ /* call when cursor's location changes so that we can update the
+ cursorsbylocation map. if you are locked and internally iterating, only
+ need to call when you are ready to "unlock".
+ */
+ void ClientCursor::updateLocation() {
+ assert( _cursorid );
+ _idleAgeMillis = 0;
+ DiskLoc cl = _c->refLoc();
+ if ( lastLoc() == cl ) {
+ //log() << "info: lastloc==curloc " << ns << '\n';
+ }
+ else {
+ recursive_scoped_lock lock(ccmutex);
+ setLastLoc_inlock(cl);
+ }
+ // may be necessary for MultiCursor even when cl hasn't changed
+ _c->noteLocation();
+ }
+
+ int ClientCursor::suggestYieldMicros() {
+ int writers = 0;
+ int readers = 0;
+
+ int micros = Client::recommendedYieldMicros( &writers , &readers );
+
+ if ( micros > 0 && writers == 0 && d.dbMutex.getState() <= 0 ) {
+ // we have a read lock, and only reads are coming in, so why bother unlocking
+ return 0;
+ }
+
+ wassert( micros < 10000000 );
+ dassert( micros < 1000001 );
+ return micros;
+ }
+
+ Record* ClientCursor::_recordForYield( ClientCursor::RecordNeeds need ) {
+ if ( need == DontNeed ) {
+ return 0;
+ }
+ else if ( need == MaybeCovered ) {
+ // TODO
+ return 0;
+ }
+ else if ( need == WillNeed ) {
+ // no-op
+ }
+ else {
+ warning() << "don't understand RecordNeeds: " << (int)need << endl;
+ return 0;
+ }
+
+ DiskLoc l = currLoc();
+ if ( l.isNull() )
+ return 0;
+
+ Record * rec = l.rec();
+ if ( rec->likelyInPhysicalMemory() )
+ return 0;
+
+ return rec;
+ }
+
+ bool ClientCursor::yieldSometimes( RecordNeeds need, bool *yielded ) {
+ if ( yielded ) {
+ *yielded = false;
+ }
+ if ( ! _yieldSometimesTracker.intervalHasElapsed() ) {
+ Record* rec = _recordForYield( need );
+ if ( rec ) {
+ // yield for page fault
+ if ( yielded ) {
+ *yielded = true;
+ }
+ return yield( suggestYieldMicros() , rec );
+ }
+ return true;
+ }
+
+ int micros = suggestYieldMicros();
+ if ( micros > 0 ) {
+ if ( yielded ) {
+ *yielded = true;
+ }
+ return yield( micros , _recordForYield( need ) );
+ }
+ return true;
+ }
+
+ void ClientCursor::staticYield( int micros , const StringData& ns , Record * rec ) {
+ killCurrentOp.checkForInterrupt( false );
+ {
+ auto_ptr<LockMongoFilesShared> lk;
+ if ( rec ) {
+ // need to lock this, else rec->touch won't be safe: the file could disappear
+ lk.reset( new LockMongoFilesShared() );
+ }
+
+ dbtempreleasecond unlock;
+ if ( unlock.unlocked() ) {
+ if ( micros == -1 )
+ micros = Client::recommendedYieldMicros();
+ if ( micros > 0 )
+ sleepmicros( micros );
+ }
+ else {
+ CurOp * c = cc().curop();
+ while ( c->parent() )
+ c = c->parent();
+ LOGSOME << "warning ClientCursor::yield can't unlock b/c of recursive lock"
+ << " ns: " << ns
+ << " top: " << c->info()
+ << endl;
+ }
+
+ if ( rec )
+ rec->touch();
+
+ lk.reset(0); // need to release this before dbtempreleasecond
+ }
+ }
+
+ bool ClientCursor::prepareToYield( YieldData &data ) {
+ if ( ! _c->supportYields() )
+ return false;
+ if ( ! _c->prepareToYield() ) {
+ return false;
+ }
+ // need to store in case 'this' gets deleted
+ data._id = _cursorid;
+
+ data._doingDeletes = _doingDeletes;
+ _doingDeletes = false;
+
+ updateLocation();
+
+ {
+ /* a quick test that our temprelease is safe.
+ todo: make a YieldingCursor class
+ and then make the following code part of a unit test.
+ */
+ const int test = 0;
+ static bool inEmpty = false;
+ if( test && !inEmpty ) {
+ inEmpty = true;
+ log() << "TEST: manipulate collection during cc:yield" << endl;
+ if( test == 1 )
+ Helpers::emptyCollection(_ns.c_str());
+ else if( test == 2 ) {
+ BSONObjBuilder b; string m;
+ dropCollection(_ns.c_str(), m, b);
+ }
+ else {
+ dropDatabase(_ns.c_str());
+ }
+ }
+ }
+ return true;
+ }
+
+ bool ClientCursor::recoverFromYield( const YieldData &data ) {
+ ClientCursor *cc = ClientCursor::find( data._id , false );
+ if ( cc == 0 ) {
+ // id was deleted
+ return false;
+ }
+
+ cc->_doingDeletes = data._doingDeletes;
+ cc->_c->recoverFromYield();
+ return true;
+ }
+
+ /** @return true if cursor is still ok */
+ bool ClientCursor::yield( int micros , Record * recordToLoad ) {
+
+ if ( ! _c->supportYields() ) // some cursors (geo@oct2011) don't support yielding
+ return true;
+
+ YieldData data;
+ prepareToYield( data );
+ staticYield( micros , _ns , recordToLoad );
+ return ClientCursor::recoverFromYield( data );
+ }
+
+ long long ctmLast = 0; // so we don't have to do find(), which is a little slow, very often.
+ long long ClientCursor::allocCursorId_inlock() {
+ long long ctm = curTimeMillis64();
+ dassert( ctm );
+ long long x;
+ while ( 1 ) {
+ x = (((long long)rand()) << 32);
+ x = x ^ ctm;
+ if ( ctm != ctmLast || ClientCursor::find_inlock(x, false) == 0 )
+ break;
+ }
+ ctmLast = ctm;
+ return x;
+ }
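+
+ // Illustrative: an id is (rand() << 32) ^ currentMillis; uniqueness within
+ // the same millisecond is guaranteed by the find_inlock() probe above.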
+
+ void ClientCursor::storeOpForSlave( DiskLoc last ) {
+ if ( ! ( _queryOptions & QueryOption_OplogReplay ))
+ return;
+
+ if ( last.isNull() )
+ return;
+
+ BSONElement e = last.obj()["ts"];
+ if ( e.type() == Date || e.type() == Timestamp )
+ _slaveReadTill = e._opTime();
+ }
+
+ void ClientCursor::updateSlaveLocation( CurOp& curop ) {
+ if ( _slaveReadTill.isNull() )
+ return;
+ mongo::updateSlaveLocation( curop , _ns.c_str() , _slaveReadTill );
+ }
+
+
+ void ClientCursor::appendStats( BSONObjBuilder& result ) {
+ recursive_scoped_lock lock(ccmutex);
+ result.appendNumber("totalOpen", clientCursorsById.size() );
+ result.appendNumber("clientCursors_size", (int) numCursors());
+ result.appendNumber("timedOut" , numberTimedOut);
+ unsigned pinned = 0;
+ unsigned notimeout = 0;
+ for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); i++ ) {
+ unsigned p = i->second->_pinValue;
+ if( p >= 100 )
+ pinned++;
+ else if( p > 0 )
+ notimeout++;
+ }
+ if( pinned )
+ result.append("pinned", pinned);
+ if( notimeout )
+ result.append("totalNoTimeout", notimeout);
+ }
+
+ // QUESTION: Restrict to the namespace from which this command was issued?
+ // Alternatively, make this command admin-only?
+ class CmdCursorInfo : public Command {
+ public:
+ CmdCursorInfo() : Command( "cursorInfo", true ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << " example: { cursorInfo : 1 }";
+ }
+ virtual LockType locktype() const { return NONE; }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ ClientCursor::appendStats( result );
+ return true;
+ }
+ } cmdCursorInfo;
+
+ struct Mem {
+ Mem() { res = virt = mapped = 0; }
+ int res;
+ int virt;
+ int mapped;
+ bool grew(const Mem& r) {
+ return (r.res && (((double)res)/r.res)>1.1 ) ||
+ (r.virt && (((double)virt)/r.virt)>1.1 ) ||
+ (r.mapped && (((double)mapped)/r.mapped)>1.1 );
+ }
+ };
+
+ /** called once a minute from killcursors thread */
+ void sayMemoryStatus() {
+ static time_t last;
+ static Mem mlast;
+ try {
+ ProcessInfo p;
+ if ( !cmdLine.quiet && p.supported() ) {
+ Mem m;
+ m.res = p.getResidentSize();
+ m.virt = p.getVirtualMemorySize();
+ m.mapped = (int) (MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ));
+ if( time(0)-last >= 300 || m.grew(mlast) ) {
+ log() << "mem (MB) res:" << m.res << " virt:" << m.virt << " mapped:" << m.mapped << endl;
+ if( m.virt - (cmdLine.dur?2:1)*m.mapped > 5000 ) {
+ ONCE log() << "warning virtual/mapped memory differential is large. journaling:" << cmdLine.dur << endl;
+ }
+ last = time(0);
+ mlast = m;
+ }
+ }
+ }
+ catch(...) {
+ log() << "ProcessInfo exception" << endl;
+ }
+ }
+
+ /** thread for timing out old cursors */
+ void ClientCursorMonitor::run() {
+ Client::initThread("clientcursormon");
+ Client& client = cc();
+ Timer t;
+ const int Secs = 4;
+ unsigned n = 0;
+ while ( ! inShutdown() ) {
+ ClientCursor::idleTimeReport( t.millisReset() );
+ sleepsecs(Secs);
+ if( ++n % (60/4) == 0 /*once a minute*/ ) {
+ sayMemoryStatus();
+ }
+ }
+ client.shutdown();
+ }
+
+ void ClientCursor::find( const string& ns , set<CursorId>& all ) {
+ recursive_scoped_lock lock(ccmutex);
+
+ for ( CCById::iterator i=clientCursorsById.begin(); i!=clientCursorsById.end(); ++i ) {
+ if ( i->second->_ns == ns )
+ all.insert( i->first );
+ }
+ }
+
+ int ClientCursor::erase(int n, long long *ids) {
+ int found = 0;
+ for ( int i = 0; i < n; i++ ) {
+ if ( erase(ids[i]) )
+ found++;
+
+ if ( inShutdown() )
+ break;
+ }
+ return found;
+
+ }
+
+ ClientCursorMonitor clientCursorMonitor;
+
+} // namespace mongo
diff --git a/src/mongo/db/clientcursor.h b/src/mongo/db/clientcursor.h
new file mode 100644
index 00000000000..e570820f62c
--- /dev/null
+++ b/src/mongo/db/clientcursor.h
@@ -0,0 +1,430 @@
+/* clientcursor.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Cursor -- and its derived classes -- are our internal cursors.
+
+ ClientCursor is a wrapper that represents a cursorid from our database
+ application's perspective.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "cursor.h"
+#include "jsobj.h"
+#include "../util/net/message.h"
+#include "../util/net/listen.h"
+#include "../util/background.h"
+#include "diskloc.h"
+#include "dbhelpers.h"
+#include "matcher.h"
+#include "../client/dbclient.h"
+#include "projection.h"
+#include "s/d_chunk_manager.h"
+
+namespace mongo {
+
+ typedef long long CursorId; /* passed to the client so it can send back on getMore */
+ class Cursor; /* internal server cursor base class */
+ class ClientCursor;
+ class ParsedQuery;
+
+ struct ByLocKey {
+
+ ByLocKey( const DiskLoc & l , const CursorId& i ) : loc(l), id(i) {}
+
+ static ByLocKey min( const DiskLoc& l ) { return ByLocKey( l , numeric_limits<long long>::min() ); }
+ static ByLocKey max( const DiskLoc& l ) { return ByLocKey( l , numeric_limits<long long>::max() ); }
+
+ bool operator<( const ByLocKey &other ) const {
+ int x = loc.compare( other.loc );
+ if ( x )
+ return x < 0;
+ return id < other.id;
+ }
+
+ DiskLoc loc;
+ CursorId id;
+
+ };
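+
+ // min()/max() bracket every cursor id for a given DiskLoc, so a CCByLoc map
+ // can be range-scanned for one location, e.g. (see aboutToDelete in the cpp):
+ // bl.lower_bound( ByLocKey::min(dl) ) .. bl.upper_bound( ByLocKey::max(dl) )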
+
+ /* todo: make this map per connection. this will perhaps prevent cursor hijacking security attacks.
+ * ERH: 9/2010 this may not work since some drivers send getMore over a different connection
+ */
+ typedef map<CursorId, ClientCursor*> CCById;
+ typedef map<ByLocKey, ClientCursor*> CCByLoc;
+
+ extern BSONObj id_obj;
+
+ class ClientCursor {
+ friend class CmdCursorInfo;
+ public:
+ static void assertNoCursors();
+
+ /* use this to assure we don't time out a cursor in the background while it is in use.
+ if you are using noTimeout() already, there is no risk anyway.
+ Further, this mechanism guards against two getMore requests on the same cursor executing
+ at the same time - which might be bad. That should never happen, but if a client driver
+ had a bug, it could (or perhaps some sort of attack situation).
+ */
+ class Pointer : boost::noncopyable {
+ ClientCursor *_c;
+ public:
+ ClientCursor * c() { return _c; }
+ void release() {
+ if( _c ) {
+ assert( _c->_pinValue >= 100 );
+ _c->_pinValue -= 100;
+ _c = 0;
+ }
+ }
+ /**
+ * call this if during a yield, the cursor got deleted
+ * if so, we don't want to use the stale pointer address
+ */
+ void deleted() {
+ _c = 0;
+ }
+ ~Pointer() { release(); }
+ Pointer(long long cursorid) {
+ recursive_scoped_lock lock(ccmutex);
+ _c = ClientCursor::find_inlock(cursorid, true);
+ if( _c ) {
+ if( _c->_pinValue >= 100 ) {
+ _c = 0;
+ uasserted(12051, "clientcursor already in use? driver problem?");
+ }
+ _c->_pinValue += 100;
+ }
+ }
+ };
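+
+ // Usage sketch (assumed getMore-style use):
+ // ClientCursor::Pointer p( cursorid ); // pins the cursor (+100)
+ // if ( ClientCursor *cc = p.c() ) { /* safe to use: can't be timed out */ }
+ // // pin released when p goes out of scope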
+
+ // This object assures safe and reliable cleanup of the ClientCursor.
+ // The implementation assumes that there will be no duplicate ids among cursors
+ // (which is assured if cursors must last longer than 1 second).
+ class CleanupPointer : boost::noncopyable {
+ public:
+ CleanupPointer() : _c( 0 ), _id( -1 ) {}
+ void reset( ClientCursor *c = 0 ) {
+ if ( c == _c )
+ return;
+ if ( _c ) {
+ // be careful in case cursor was deleted by someone else
+ ClientCursor::erase( _id );
+ }
+ if ( c ) {
+ _c = c;
+ _id = c->_cursorid;
+ }
+ else {
+ _c = 0;
+ _id = -1;
+ }
+ }
+ ~CleanupPointer() {
+ DESTRUCTOR_GUARD ( reset(); );
+ }
+ operator bool() { return _c; }
+ ClientCursor * operator-> () { return _c; }
+ private:
+ ClientCursor *_c;
+ CursorId _id;
+ };
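+
+ // Usage sketch (illustrative): the held cursor is erased when the pointer
+ // is reset or destroyed, guarding against leaks if an exception unwinds:
+ // ClientCursor::CleanupPointer ccPointer;
+ // ccPointer.reset( someClientCursor ); // someClientCursor is hypothetical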
+
+ ClientCursor(int queryOptions, const shared_ptr<Cursor>& c, const string& ns, BSONObj query = BSONObj() );
+
+ ~ClientCursor();
+
+ // *************** basic accessors *******************
+
+ CursorId cursorid() const { return _cursorid; }
+ string ns() const { return _ns; }
+ Database * db() const { return _db; }
+ const BSONObj& query() const { return _query; }
+ int queryOptions() const { return _queryOptions; }
+
+ DiskLoc lastLoc() const { return _lastLoc; }
+
+ /* Get rid of cursors for namespaces 'ns'. When dropping a db, ns is "dbname."
+ Used by drop, dropIndexes, dropDatabase.
+ */
+ static void invalidate(const char *ns);
+
+ /**
+ * @param microsToSleep -1 : ask client
+ * >=0 : sleep for that amount
+ * @param recordToLoad after yielding lock, load this record with only mmutex
+ * do a dbtemprelease
+ * note: caller should check matcher.docMatcher().atomic() first and not yield if atomic -
+ * we don't do it here, as this->matcher (above) is only initialized for true queries/getmore.
+ * (ie not set for remote/update)
+ * @return if the cursor is still valid.
+ * if false is returned, then this ClientCursor should be considered deleted -
+ * in fact, the whole database could be gone.
+ */
+ bool yield( int microsToSleep = -1 , Record * recordToLoad = 0 );
+
+ enum RecordNeeds {
+ DontNeed = -1 , MaybeCovered = 0 , WillNeed = 100
+ };
+
+ /**
+ * @param need whether or not the next record has to be read from disk for sure;
+ * if this is true, will yield if the next record isn't in memory
+ * @param yielded true if a yield occurred, and potentially if a yield did not occur
+ * @return same as yield()
+ */
+ bool yieldSometimes( RecordNeeds need, bool *yielded = 0 );
+
+ static int suggestYieldMicros();
+ static void staticYield( int micros , const StringData& ns , Record * rec );
+
+ struct YieldData { CursorId _id; bool _doingDeletes; };
+ bool prepareToYield( YieldData &data );
+ static bool recoverFromYield( const YieldData &data );
+
+ struct YieldLock : boost::noncopyable {
+ explicit YieldLock( ptr<ClientCursor> cc )
+ : _canYield(cc->_c->supportYields()) {
+ if ( _canYield ) {
+ cc->prepareToYield( _data );
+ _unlock.reset(new dbtempreleasecond());
+ }
+ }
+ ~YieldLock() {
+ if ( _unlock ) {
+ log( LL_WARNING ) << "ClientCursor::YieldLock not closed properly" << endl;
+ relock();
+ }
+ }
+ bool stillOk() {
+ if ( ! _canYield )
+ return true;
+ relock();
+ return ClientCursor::recoverFromYield( _data );
+ }
+ void relock() {
+ _unlock.reset();
+ }
+ private:
+ const bool _canYield;
+ YieldData _data;
+ scoped_ptr<dbtempreleasecond> _unlock;
+ };
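+
+ // Usage sketch (assumed pattern):
+ // ClientCursor::YieldLock yl( cc );
+ // /* ... slow work while the lock is released ... */
+ // if ( ! yl.stillOk() ) { /* cursor was deleted during the yield */ }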
+
+ // --- some pass through helpers for Cursor ---
+
+ Cursor* c() const { return _c.get(); }
+ int pos() const { return _pos; }
+
+ void incPos( int n ) { _pos += n; } // TODO: this is bad
+ void setPos( int n ) { _pos = n; } // TODO : this is bad too
+
+ BSONObj indexKeyPattern() { return _c->indexKeyPattern(); }
+ bool modifiedKeys() const { return _c->modifiedKeys(); }
+ bool isMultiKey() const { return _c->isMultiKey(); }
+
+ bool ok() { return _c->ok(); }
+ bool advance() { return _c->advance(); }
+ BSONObj current() { return _c->current(); }
+ DiskLoc currLoc() { return _c->currLoc(); }
+ BSONObj currKey() const { return _c->currKey(); }
+
+ /**
+ * same as BSONObj::getFieldsDotted
+ * if it can be retrieved from key, it is
+ * @param holder keeps the currKey in scope by keeping a reference to it here. generally you'll want
+ * holder and ret to destruct about the same time.
+ * @return if this was retrieved from key
+ */
+ bool getFieldsDotted( const string& name, BSONElementSet &ret, BSONObj& holder );
+
+ /**
+ * same as BSONObj::getFieldDotted
+ * if it can be retrieved from key, it is
+ * @return if this was retrieved from key
+ */
+ BSONElement getFieldDotted( const string& name , BSONObj& holder , bool * fromKey = 0 ) ;
+
+ /** extract items from object which match a pattern object.
+ * e.g., if pattern is { x : 1, y : 1 }, builds an object with
+ * x and y elements of this object, if they are present.
+ * returns elements with original field names
+ * NOTE: copied from BSONObj::extractFields
+ */
+ BSONObj extractFields(const BSONObj &pattern , bool fillWithNull = false) ;
+
+ bool currentIsDup() { return _c->getsetdup( _c->currLoc() ); }
+
+ bool currentMatches() {
+ if ( ! _c->matcher() )
+ return true;
+ return _c->matcher()->matchesCurrent( _c.get() );
+ }
+
+ void setChunkManager( ShardChunkManagerPtr manager ){ _chunkManager = manager; }
+ ShardChunkManagerPtr getChunkManager(){ return _chunkManager; }
+
+ private:
+ void setLastLoc_inlock(DiskLoc);
+
+ static ClientCursor* find_inlock(CursorId id, bool warn = true) {
+ CCById::iterator it = clientCursorsById.find(id);
+ if ( it == clientCursorsById.end() ) {
+ if ( warn )
+ OCCASIONALLY out() << "ClientCursor::find(): cursor not found in map " << id << " (ok after a drop)\n";
+ return 0;
+ }
+ return it->second;
+ }
+ public:
+ static ClientCursor* find(CursorId id, bool warn = true) {
+ recursive_scoped_lock lock(ccmutex);
+ ClientCursor *c = find_inlock(id, warn);
+ // if this asserts, your code was not thread safe - you either need to set no timeout
+ // for the cursor or keep a ClientCursor::Pointer in scope for it.
+ massert( 12521, "internal error: use of an unlocked ClientCursor", c == 0 || c->_pinValue );
+ return c;
+ }
+
+ static bool erase(CursorId id) {
+ recursive_scoped_lock lock(ccmutex);
+ ClientCursor *cc = find_inlock(id);
+ if ( cc ) {
+ assert( cc->_pinValue < 100 ); // you can't still have an active ClientCursor::Pointer
+ delete cc;
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * @return number of cursors found
+ */
+ static int erase( int n , long long * ids );
+
+ /* call when cursor's location changes so that we can update the
+ cursorsbylocation map. if you are locked and internally iterating, only
+ need to call when you are ready to "unlock".
+ */
+ void updateLocation();
+
+ void mayUpgradeStorage() {
+ /* if ( !ids_.get() )
+ return;
+ stringstream ss;
+ ss << ns << "." << cursorid;
+ ids_->mayUpgradeStorage( ss.str() );*/
+ }
+
+ /**
+ * @param millis amount of idle passed time since last call
+ */
+ bool shouldTimeout( unsigned millis );
+
+ void storeOpForSlave( DiskLoc last );
+ void updateSlaveLocation( CurOp& curop );
+
+ unsigned idleTime() const { return _idleAgeMillis; }
+
+ void setDoingDeletes( bool doingDeletes ) {_doingDeletes = doingDeletes; }
+
+ void slaveReadTill( const OpTime& t ) { _slaveReadTill = t; }
+
+ public: // static methods
+
+ static void idleTimeReport(unsigned millis);
+
+ static void appendStats( BSONObjBuilder& result );
+ static unsigned numCursors() { return clientCursorsById.size(); }
+ static void informAboutToDeleteBucket(const DiskLoc& b);
+ static void aboutToDelete(const DiskLoc& dl);
+ static void find( const string& ns , set<CursorId>& all );
+
+
+ private: // methods
+
+ // cursors normally time out after an inactivity period to prevent excess memory use
+ // setting this prevents timeout of the cursor in question.
+ void noTimeout() { _pinValue++; }
+
+ CCByLoc& byLoc() { return _db->ccByLoc; }
+
+ Record* _recordForYield( RecordNeeds need );
+
+ private:
+
+ CursorId _cursorid;
+
+ const string _ns;
+ Database * _db;
+
+ const shared_ptr<Cursor> _c;
+ map<string,int> _indexedFields; // map from indexed field to offset in key object
+ int _pos; // # objects into the cursor so far
+
+ const BSONObj _query; // used for logging diags only; optional in constructor
+ int _queryOptions; // see enum QueryOptions dbclient.h
+
+ OpTime _slaveReadTill;
+
+ DiskLoc _lastLoc; // use getter and setter not this (important)
+ unsigned _idleAgeMillis; // how long has the cursor been around, relative to server idle time
+
+ /* 0 = normal
+ 1 = no timeout allowed
+ 100 = in use (pinned) -- see Pointer class
+ */
+ unsigned _pinValue;
+
+ bool _doingDeletes; // when true we are doing the delete and aboutToDelete shouldn't manipulate us
+ ElapsedTracker _yieldSometimesTracker;
+
+ ShardChunkManagerPtr _chunkManager;
+
+ public:
+ shared_ptr<ParsedQuery> pq;
+ shared_ptr<Projection> fields; // which fields query wants returned
+ Message originalMessage; // this is effectively an auto ptr for data the matcher points to
+
+
+
+ private: // static members
+
+ static CCById clientCursorsById;
+ static long long numberTimedOut;
+ static boost::recursive_mutex& ccmutex; // must use this for all statics above!
+ static CursorId allocCursorId_inlock();
+
+ };
+
+ class ClientCursorMonitor : public BackgroundJob {
+ public:
+ string name() const { return "ClientCursorMonitor"; }
+ void run();
+ };
+
+} // namespace mongo
+
+// ClientCursor should only be used with auto_ptr, because it needs to be
+// release()ed after a yield if stillOk() returns false, and these pointer types
+// do not support releasing. The empty specializations below prevent them from
+// being used accidentally.
+namespace boost{
+ template<> class scoped_ptr<mongo::ClientCursor> {};
+ template<> class shared_ptr<mongo::ClientCursor> {};
+}
diff --git a/src/mongo/db/cloner.cpp b/src/mongo/db/cloner.cpp
new file mode 100644
index 00000000000..e35ae95052d
--- /dev/null
+++ b/src/mongo/db/cloner.cpp
@@ -0,0 +1,763 @@
+// cloner.cpp - copy a database (export/import basically)
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "cloner.h"
+#include "pdfile.h"
+#include "../client/dbclient.h"
+#include "../bson/util/builder.h"
+#include "jsobj.h"
+#include "ops/query.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "repl.h"
+
+namespace mongo {
+
+ BSONElement getErrField(const BSONObj& o);
+
+ void ensureHaveIdIndex(const char *ns);
+
+ bool replAuthenticate(DBClientBase *);
+
+ /** Selectively release the mutex based on a parameter. */
+ class dbtempreleaseif {
+ public:
+ dbtempreleaseif( bool release ) : _impl( release ? new dbtemprelease() : 0 ) {}
+ private:
+ shared_ptr< dbtemprelease > _impl;
+ };
+
+ void mayInterrupt( bool mayBeInterrupted ) {
+ if ( mayBeInterrupted ) {
+ killCurrentOp.checkForInterrupt( false );
+ }
+ }
+
+ class Cloner: boost::noncopyable {
+ auto_ptr< DBClientWithCommands > conn;
+ void copy(const char *from_ns, const char *to_ns, bool isindex, bool logForRepl,
+ bool masterSameProcess, bool slaveOk, bool mayYield, bool mayBeInterrupted, Query q = Query());
+ struct Fun;
+ public:
+ Cloner() { }
+
+ /* slaveOk - if true it is ok if the source of the data is !ismaster.
+ useReplAuth - use the credentials we normally use as a replication slave for the cloning
+ snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
+ for example repairDatabase need not use it.
+ */
+ void setConnection( DBClientWithCommands *c ) { conn.reset( c ); }
+
+ /** copy the entire database */
+ bool go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode = 0);
+
+ bool copyCollection( const string& ns , const BSONObj& query , string& errmsg , bool mayYield, bool mayBeInterrupted, bool copyIndexes = true, bool logForRepl = true );
+ };
+
+ /* for index info object:
+ { "name" : "name_1" , "ns" : "foo.index3" , "key" : { "name" : 1.0 } }
+ we need to fix up the value in the "ns" parameter so that the name prefix is correct on a
+ copy to a new name.
+ */
+ BSONObj fixindex(BSONObj o) {
+ BSONObjBuilder b;
+ BSONObjIterator i(o);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+
+ // for now, skip the "v" field so that v:0 indexes will be upgraded to v:1
+ if ( string("v") == e.fieldName() ) {
+ continue;
+ }
+
+ if ( string("ns") == e.fieldName() ) {
+ uassert( 10024 , "bad ns field for index during dbcopy", e.type() == String);
+ const char *p = strchr(e.valuestr(), '.');
+ uassert( 10025 , "bad ns field for index during dbcopy [2]", p);
+ string newname = cc().database()->name + p;
+ b.append("ns", newname);
+ }
+ else
+ b.append(e);
+ }
+ BSONObj res= b.obj();
+
+ /* if( mod ) {
+ out() << "before: " << o.toString() << endl;
+ o.dump();
+ out() << "after: " << res.toString() << endl;
+ res.dump();
+ }*/
+
+ return res;
+ }
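+
+ // Example of the transform (using the index object from the comment above):
+ // cloning into a database named "bar",
+ // { "name":"name_1", "ns":"foo.index3", "key":{ "name":1 }, "v":0 }
+ // becomes
+ // { "name":"name_1", "ns":"bar.index3", "key":{ "name":1 } } // "v" dropped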
+
+ struct Cloner::Fun {
+ Fun() : lastLog(0) { }
+ time_t lastLog;
+ void operator()( DBClientCursorBatchIterator &i ) {
+ mongolock l( true );
+ if ( context ) {
+ context->relocked();
+ }
+
+ while( i.moreInCurrentBatch() ) {
+ if ( n % 128 == 127 /*yield some*/ ) {
+ time_t now = time(0);
+ if( now - lastLog >= 60 ) {
+ // report progress
+ if( lastLog )
+ log() << "clone " << to_collection << ' ' << n << endl;
+ lastLog = now;
+ }
+ mayInterrupt( _mayBeInterrupted );
+ dbtempreleaseif t( _mayYield );
+ }
+
+ BSONObj tmp = i.nextSafe();
+
+ /* assure object is valid. note this will slow us down a little. */
+ if ( !tmp.valid() ) {
+ stringstream ss;
+ ss << "Cloner: skipping corrupt object from " << from_collection;
+ BSONElement e = tmp.firstElement();
+ try {
+ e.validate();
+ ss << " firstElement: " << e;
+ }
+ catch( ... ) {
+ ss << " firstElement corrupt";
+ }
+ out() << ss.str() << endl;
+ continue;
+ }
+
+ ++n;
+
+ BSONObj js = tmp;
+ if ( isindex ) {
+ assert( strstr(from_collection, "system.indexes") );
+ js = fixindex(tmp);
+ storedForLater->push_back( js.getOwned() );
+ continue;
+ }
+
+ try {
+ theDataFileMgr.insertWithObjMod(to_collection, js);
+ if ( logForRepl )
+ logOp("i", to_collection, js);
+
+ getDur().commitIfNeeded();
+ }
+ catch( UserException& e ) {
+ log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
+ }
+
+ RARELY if ( time( 0 ) - saveLast > 60 ) {
+ log() << n << " objects cloned so far from collection " << from_collection << endl;
+ saveLast = time( 0 );
+ }
+ }
+ }
+ int n;
+ bool isindex;
+ const char *from_collection;
+ const char *to_collection;
+ time_t saveLast;
+ list<BSONObj> *storedForLater;
+ bool logForRepl;
+ Client::Context *context;
+ bool _mayYield;
+ bool _mayBeInterrupted;
+ };
+
+ /* copy the specified collection
+ isindex - if true, this is system.indexes collection, in which we do some transformation when copying.
+ */
+ void Cloner::copy(const char *from_collection, const char *to_collection, bool isindex, bool logForRepl, bool masterSameProcess, bool slaveOk, bool mayYield, bool mayBeInterrupted, Query query) {
+ list<BSONObj> storedForLater;
+
+ Fun f;
+ f.n = 0;
+ f.isindex = isindex;
+ f.from_collection = from_collection;
+ f.to_collection = to_collection;
+ f.saveLast = time( 0 );
+ f.storedForLater = &storedForLater;
+ f.logForRepl = logForRepl;
+ f._mayYield = mayYield;
+ f._mayBeInterrupted = mayBeInterrupted;
+
+ int options = QueryOption_NoCursorTimeout | ( slaveOk ? QueryOption_SlaveOk : 0 );
+ {
+ f.context = cc().getContext();
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
+ DBClientConnection *remote = dynamic_cast< DBClientConnection* >( conn.get() );
+ if ( remote ) {
+ remote->query( boost::function<void(DBClientCursorBatchIterator &)>( f ), from_collection, query, 0, options );
+ }
+ else {
+ // there is no exhaust mode for direct client, so we have this hack
+ auto_ptr<DBClientCursor> c = conn->query( from_collection, query, 0, 0, 0, options );
+ assert( c.get() );
+ while( c->more() ) {
+ DBClientCursorBatchIterator i( *c );
+ f( i );
+ }
+ }
+ }
+
+ if ( storedForLater.size() ) {
+ for ( list<BSONObj>::iterator i = storedForLater.begin(); i!=storedForLater.end(); i++ ) {
+ BSONObj js = *i;
+ try {
+ theDataFileMgr.insertWithObjMod(to_collection, js);
+ if ( logForRepl )
+ logOp("i", to_collection, js);
+
+ getDur().commitIfNeeded();
+ }
+ catch( UserException& e ) {
+ log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
+ }
+ }
+ }
+ }
+
+ bool copyCollectionFromRemote(const string& host, const string& ns, string& errmsg) {
+ Cloner c;
+
+ DBClientConnection *conn = new DBClientConnection();
+ // cloner owns conn in auto_ptr
+ c.setConnection(conn);
+ uassert(15908, errmsg, conn->connect(host, errmsg) && replAuthenticate(conn));
+
+ return c.copyCollection(ns, BSONObj(), errmsg, true, false, /*copyIndexes*/ true, false);
+ }
+
+ bool Cloner::copyCollection( const string& ns, const BSONObj& query, string& errmsg,
+ bool mayYield, bool mayBeInterrupted, bool copyIndexes, bool logForRepl ) {
+
+ writelock lk(ns); // TODO: acquire this lock lower down in the call stack
+ Client::Context ctx(ns);
+
+ {
+ // config
+ string temp = ctx.db()->name + ".system.namespaces";
+ BSONObj config = conn->findOne( temp , BSON( "name" << ns ) );
+ if ( config["options"].isABSONObj() )
+ if ( ! userCreateNS( ns.c_str() , config["options"].Obj() , errmsg, logForRepl , 0 ) )
+ return false;
+ }
+
+ {
+ // main data
+ copy( ns.c_str() , ns.c_str() , /*isindex*/false , logForRepl , false , true , mayYield, mayBeInterrupted, Query(query).snapshot() );
+ }
+
+ /* TODO : copyIndexes bool does not seem to be implemented! */
+ if( !copyIndexes ) {
+ log() << "ERROR copy collection copyIndexes not implemented? " << ns << endl;
+ }
+
+ {
+ // indexes
+ string temp = ctx.db()->name + ".system.indexes";
+ copy( temp.c_str() , temp.c_str() , /*isindex*/true , logForRepl , false , true , mayYield, mayBeInterrupted, BSON( "ns" << ns ) );
+ }
+ getDur().commitIfNeeded();
+ return true;
+ }
+
+ extern bool inDBRepair;
+ void ensureIdIndexForNewNs(const char *ns);
+
+ bool Cloner::go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode) {
+ if ( errCode ) {
+ *errCode = 0;
+ }
+ massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl );
+
+ string todb = cc().database()->name;
+ stringstream a,b;
+ a << "localhost:" << cmdLine.port;
+ b << "127.0.0.1:" << cmdLine.port;
+ bool masterSameProcess = ( a.str() == masterHost || b.str() == masterHost );
+ if ( masterSameProcess ) {
+ if ( fromdb == todb && cc().database()->path == dbpath ) {
+ // guard against an "infinite" loop
+ /* if you are replicating, the local.sources config may be wrong if you get this */
+ errmsg = "can't clone from self (localhost).";
+ return false;
+ }
+ }
+ /* todo: we can put these releases inside dbclient or a dbclient specialization.
+ or just wait until we get rid of global lock anyway.
+ */
+ string ns = fromdb + ".system.namespaces";
+ list<BSONObj> toClone;
+ {
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
+
+ // just using exhaust for collection copying right now
+ auto_ptr<DBClientCursor> c;
+ {
+ if ( conn.get() ) {
+ // nothing to do
+ }
+ else if ( !masterSameProcess ) {
+ ConnectionString cs = ConnectionString::parse( masterHost, errmsg );
+ auto_ptr<DBClientBase> con( cs.connect( errmsg ));
+ if ( !con.get() )
+ return false;
+ if( !replAuthenticate(con.get()) )
+ return false;
+
+ conn = con;
+ }
+ else {
+ conn.reset( new DBDirectClient() );
+ }
+ // todo: if snapshot (bool param to this func) is true, should we snapshot this query?
+ // it would only be relevant if there were thousands of collections -- and maybe
+ // even then it is hard to exceed a single cursor batch.
+ // for repl it is probably ok as we apply the oplog section after the clone (i.e.
+ // repl does not use snapshot=true).
+ c = conn->query( ns.c_str(), BSONObj(), 0, 0, 0, slaveOk ? QueryOption_SlaveOk : 0 );
+ }
+
+ if ( c.get() == 0 ) {
+ errmsg = "query failed " + ns;
+ return false;
+ }
+
+ if ( c->more() ) {
+ BSONObj first = c->next();
+ if( !getErrField(first).eoo() ) {
+ if ( errCode ) {
+ *errCode = first.getIntField("code");
+ }
+ errmsg = "query failed " + ns;
+ return false;
+ }
+ c->putBack( first );
+ }
+
+ while ( c->more() ) {
+ BSONObj collection = c->next();
+
+ log(2) << "\t cloner got " << collection << endl;
+
+ BSONElement e = collection.getField("name");
+ if ( e.eoo() ) {
+ string s = "bad system.namespaces object " + collection.toString();
+ massert( 10290 , s.c_str(), false);
+ }
+ assert( !e.eoo() );
+ assert( e.type() == String );
+ const char *from_name = e.valuestr();
+
+ if( strstr(from_name, ".system.") ) {
+ /* system.users and system.js are cloned -- but nothing else from system.*;
+ system.indexes is handled specially at the end */
+ if( legalClientSystemNS( from_name , true ) == 0 ) {
+ log(2) << "\t\t not cloning because system collection" << endl;
+ continue;
+ }
+ }
+ if( ! NamespaceString::normal( from_name ) ) {
+ log(2) << "\t\t not cloning because has $ " << endl;
+ continue;
+ }
+ toClone.push_back( collection.getOwned() );
+ }
+ }
+
+ for ( list<BSONObj>::iterator i=toClone.begin(); i != toClone.end(); i++ ) {
+ {
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
+ }
+ BSONObj collection = *i;
+ log(2) << " really will clone: " << collection << endl;
+ const char * from_name = collection["name"].valuestr();
+ BSONObj options = collection.getObjectField("options");
+
+ /* change name "<fromdb>.collection" -> "<todb>.collection" */
+ const char *p = strchr(from_name, '.');
+ assert(p);
+ string to_name = todb + p;
+
+ bool wantIdIndex = false;
+ {
+ string err;
+ const char *toname = to_name.c_str();
+ /* we defer building id index for performance - building it in batch is much faster */
+ userCreateNS(toname, options, err, logForRepl, &wantIdIndex);
+ }
+ log(1) << "\t\t cloning " << from_name << " -> " << to_name << endl;
+ Query q;
+ if( snapshot )
+ q.snapshot();
+ copy(from_name, to_name.c_str(), false, logForRepl, masterSameProcess, slaveOk, mayYield, mayBeInterrupted, q);
+
+ if( wantIdIndex ) {
+ /* we need dropDups to be true as we didn't do a true snapshot and this is before applying oplog operations
+ that occur during the initial sync. inDBRepair makes dropDups be true.
+ */
+ bool old = inDBRepair;
+ try {
+ inDBRepair = true;
+ ensureIdIndexForNewNs(to_name.c_str());
+ inDBRepair = old;
+ }
+ catch(...) {
+ inDBRepair = old;
+ throw;
+ }
+ }
+ }
+
+ // now build the indexes
+
+ string system_indexes_from = fromdb + ".system.indexes";
+ string system_indexes_to = todb + ".system.indexes";
+ /* [dm]: is the ID index sometimes not called "_id_"? There is other code in the system that looks for a "_id" prefix
+ rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this
+ is dubious here at the moment.
+ */
+ // won't need a snapshot of the query of system.indexes as there can never be very many.
+ copy(system_indexes_from.c_str(), system_indexes_to.c_str(), true, logForRepl, masterSameProcess, slaveOk, mayYield, mayBeInterrupted, BSON( "name" << NE << "_id_" ) );
+
+ return true;
+ }
+
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode) {
+ Cloner c;
+ return c.go(masterHost, errmsg, fromdb, logForReplication, slaveOk, useReplAuth, snapshot, mayYield, mayBeInterrupted, errCode);
+ }
+
+ /* Usage:
+ mydb.$cmd.findOne( { clone: "fromhost" } );
+ */
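+ /* shell equivalent (assuming the standard mongo shell helper):
+ db.cloneDatabase("fromhost")
+ */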
+ class CmdClone : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "clone this database from an instance of the db on another host\n";
+ help << "{ clone : \"host13\" }";
+ }
+ CmdClone() : Command("clone") { }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string from = cmdObj.getStringField("clone");
+ if ( from.empty() )
+ return false;
+ /* replication note: we must logOp() not the command, but the cloned data -- if the slave
+ were to clone it would get a different point-in-time and not match.
+ */
+ return cloneFrom(from.c_str(), errmsg, dbname,
+ /*logForReplication=*/!fromRepl, /*slaveOk*/false, /*usereplauth*/false, /*snapshot*/true, /*mayYield*/true, /*mayBeInterrupted*/false);
+ }
+ } cmdclone;
+
+ class CmdCloneCollection : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdCloneCollection() : Command("cloneCollection") { }
+ virtual void help( stringstream &help ) const {
+ help << "{ cloneCollection: <namespace>, from: <host> [,query: <query_filter>] [,copyIndexes:<bool>] }"
+ "\nCopies a collection from one server to another. Do not use on a single server as the destination "
+ "is placed at the same db.collection (namespace) as the source.\n"
+ "Warning: the local copy of 'ns' is emptied before the copying begins. Any existing data will be lost there."
+ ;
+ }
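+ /* hypothetical invocation, run from the db the collection should land in:
+ db.runCommand( { cloneCollection: "test.users", from: "otherhost:27017", query: { active: true } } )
+ */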
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string fromhost = cmdObj.getStringField("from");
+ if ( fromhost.empty() ) {
+ errmsg = "missing 'from' parameter";
+ return false;
+ }
+ {
+ HostAndPort h(fromhost);
+ if( h.isSelf() ) {
+ errmsg = "can't cloneCollection from self";
+ return false;
+ }
+ }
+ string collection = cmdObj.getStringField("cloneCollection");
+ if ( collection.empty() ) {
+ errmsg = "bad 'cloneCollection' value";
+ return false;
+ }
+ BSONObj query = cmdObj.getObjectField("query");
+ if ( query.isEmpty() )
+ query = BSONObj();
+
+ BSONElement copyIndexesSpec = cmdObj.getField("copyindexes");
+ bool copyIndexes = copyIndexesSpec.isBoolean() ? copyIndexesSpec.boolean() : true;
+
+ log() << "cloneCollection. db:" << dbname << " collection:" << collection << " from: " << fromhost
+ << " query: " << query << " " << ( copyIndexes ? "" : ", not copying indexes" ) << endl;
+
+ Cloner c;
+ auto_ptr<DBClientConnection> myconn;
+ myconn.reset( new DBClientConnection() );
+ if ( ! myconn->connect( fromhost , errmsg ) )
+ return false;
+
+ c.setConnection( myconn.release() );
+
+ return c.copyCollection( collection , query, errmsg , true, false, copyIndexes );
+ }
+ } cmdclonecollection;
+
+
+ thread_specific_ptr< DBClientConnection > authConn_;
+ /* Usage:
+ admindb.$cmd.findOne( { copydbgetnonce: 1, fromhost: <hostname> } );
+ */
+ class CmdCopyDbGetNonce : public Command {
+ public:
+ CmdCopyDbGetNonce() : Command("copydbgetnonce") { }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "get a nonce for subsequent copy db request from secure server\n";
+ help << "usage: {copydbgetnonce: 1, fromhost: <hostname>}";
+ }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string fromhost = cmdObj.getStringField("fromhost");
+ if ( fromhost.empty() ) {
+ /* copy from self */
+ stringstream ss;
+ ss << "localhost:" << cmdLine.port;
+ fromhost = ss.str();
+ }
+ authConn_.reset( new DBClientConnection() );
+ BSONObj ret;
+ {
+ dbtemprelease t;
+ if ( !authConn_->connect( fromhost, errmsg ) )
+ return false;
+ if( !authConn_->runCommand( "admin", BSON( "getnonce" << 1 ), ret ) ) {
+ errmsg = "couldn't get nonce " + ret.toString();
+ return false;
+ }
+ }
+ result.appendElements( ret );
+ return true;
+ }
+ } cmdcopydbgetnonce;
+
+ /* Usage:
+ admindb.$cmd.findOne( { copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, username: <username>, nonce: <nonce>, key: <key>] } );
+ */
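+ /* auth handshake sketch (key derivation assumed to follow the MONGODB-CR scheme
+ of this era):
+ 1) { copydbgetnonce: 1, fromhost: <h> } fetches a nonce from the source server;
+ 2) key = md5( nonce + username + md5( username + ":mongo:" + password ) );
+ 3) { copydb: 1, ..., username: <u>, nonce: <n>, key: <key> } -- the connection
+ saved by step 1 (authConn_) is reused below to run "authenticate" against fromdb.
+ */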
+ class CmdCopyDb : public Command {
+ public:
+ CmdCopyDb() : Command("copydb") { }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "copy a database from another host to this host\n";
+ help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, slaveOk: <bool>, username: <username>, nonce: <nonce>, key: <key>]}";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool slaveOk = cmdObj["slaveOk"].trueValue();
+ string fromhost = cmdObj.getStringField("fromhost");
+ if ( fromhost.empty() ) {
+ /* copy from self */
+ stringstream ss;
+ ss << "localhost:" << cmdLine.port;
+ fromhost = ss.str();
+ }
+ string fromdb = cmdObj.getStringField("fromdb");
+ string todb = cmdObj.getStringField("todb");
+ if ( fromhost.empty() || todb.empty() || fromdb.empty() ) {
+ errmsg = "parms missing - {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>}";
+ return false;
+ }
+ Cloner c;
+ string username = cmdObj.getStringField( "username" );
+ string nonce = cmdObj.getStringField( "nonce" );
+ string key = cmdObj.getStringField( "key" );
+ if ( !username.empty() && !nonce.empty() && !key.empty() ) {
+ uassert( 13008, "must call copydbgetnonce first", authConn_.get() );
+ BSONObj ret;
+ {
+ dbtemprelease t;
+ if ( !authConn_->runCommand( fromdb, BSON( "authenticate" << 1 << "user" << username << "nonce" << nonce << "key" << key ), ret ) ) {
+ errmsg = "unable to login " + ret.toString();
+ return false;
+ }
+ }
+ c.setConnection( authConn_.release() );
+ }
+ Client::Context ctx(todb);
+ bool res = c.go(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, slaveOk, /*replauth*/false, /*snapshot*/true, /*mayYield*/true, /*mayBeInterrupted*/ false);
+ return res;
+ }
+ } cmdcopydb;
+
+ class CmdRenameCollection : public Command {
+ public:
+ CmdRenameCollection() : Command( "renameCollection" ) {}
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool requiresAuth() { return false; } // do our own auth
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool logTheOp() {
+ return true; // can't log steps when doing fast rename within a db, so always log the op rather than individual steps comprising it.
+ }
+ virtual void help( stringstream &help ) const {
+ help << " example: { renameCollection: foo.a, to: bar.b }";
+ }
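+ // overview of the logic below: a rename within a single db takes the fast
+ // renameNamespace path; a cross-db rename falls back to copying the data, then
+ // the indexes (with each "ns" field rewritten to the target), and finally
+ // dropping the source collection.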
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string source = cmdObj.getStringField( name.c_str() );
+ string target = cmdObj.getStringField( "to" );
+ uassert(15967,"invalid collection name: " + target, NamespaceString::validCollectionName(target.c_str()));
+ if ( source.empty() || target.empty() ) {
+ errmsg = "invalid command syntax";
+ return false;
+ }
+
+ bool capped = false;
+ long long size = 0;
+ {
+ Client::Context ctx( source ); // auths against source
+ NamespaceDetails *nsd = nsdetails( source.c_str() );
+ uassert( 10026 , "source namespace does not exist", nsd );
+ capped = nsd->capped;
+ if ( capped )
+ for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext )
+ size += i.ext()->length;
+ }
+
+ Client::Context ctx( target ); //auths against target
+
+ if ( nsdetails( target.c_str() ) ) {
+ uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() );
+ BSONObjBuilder bb( result.subobjStart( "dropTarget" ) );
+ dropCollection( target , errmsg , bb );
+ bb.done();
+ if ( errmsg.size() > 0 )
+ return false;
+ }
+
+ {
+ char from[256];
+ nsToDatabase( source.c_str(), from );
+ char to[256];
+ nsToDatabase( target.c_str(), to );
+ if ( strcmp( from, to ) == 0 ) {
+ renameNamespace( source.c_str(), target.c_str() );
+ // make sure we drop counters etc
+ Top::global.collectionDropped( source );
+ return true;
+ }
+ }
+
+ BSONObjBuilder spec;
+ if ( capped ) {
+ spec.appendBool( "capped", true );
+ spec.append( "size", double( size ) );
+ }
+ if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) )
+ return false;
+
+ auto_ptr< DBClientCursor > c;
+ DBDirectClient bridge;
+
+ c = bridge.query( source, BSONObj() );
+ while( c->more() ) {
+ BSONObj o = c->next();
+ theDataFileMgr.insertWithObjMod( target.c_str(), o );
+ }
+
+ char cl[256];
+ nsToDatabase( source.c_str(), cl );
+ string sourceIndexes = string( cl ) + ".system.indexes";
+ nsToDatabase( target.c_str(), cl );
+ string targetIndexes = string( cl ) + ".system.indexes";
+ c = bridge.query( sourceIndexes, QUERY( "ns" << source ) );
+ while( c->more() ) {
+ BSONObj o = c->next();
+ BSONObjBuilder b;
+ BSONObjIterator i( o );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( strcmp( e.fieldName(), "ns" ) == 0 ) {
+ b.append( "ns", target );
+ }
+ else {
+ b.append( e );
+ }
+ }
+ BSONObj n = b.done();
+ theDataFileMgr.insertWithObjMod( targetIndexes.c_str(), n );
+ }
+
+ {
+ Client::Context ctx( source );
+ dropCollection( source, errmsg, result );
+ }
+ return true;
+ }
+ } cmdrenamecollection;
+
+} // namespace mongo
diff --git a/src/mongo/db/cloner.h b/src/mongo/db/cloner.h
new file mode 100644
index 00000000000..130fea0fac1
--- /dev/null
+++ b/src/mongo/db/cloner.h
@@ -0,0 +1,39 @@
+// cloner.h - copy a database (export/import basically)
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ /**
+ * @param slaveOk - if true it is ok if the source of the data is !ismaster.
+ * @param useReplAuth - use the credentials we normally use as a replication slave for the cloning
+ * @param snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
+ * for example repairDatabase need not use it.
+ * @param errCode - If provided, this will be set on error to the server's error code. Currently
+ * this will only be set if there is an error in the initial system.namespaces query.
+ */
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield,
+ bool mayBeInterrupted, int *errCode = 0);
+
+ bool copyCollectionFromRemote(const string& host, const string& ns, string& errmsg);
+
+} // namespace mongo
diff --git a/src/mongo/db/cmdline.cpp b/src/mongo/db/cmdline.cpp
new file mode 100644
index 00000000000..a9b0d7097ca
--- /dev/null
+++ b/src/mongo/db/cmdline.cpp
@@ -0,0 +1,519 @@
+// cmdline.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "commands.h"
+#include "../util/password.h"
+#include "../util/processinfo.h"
+#include "../util/net/listen.h"
+#include "security_common.h"
+#ifdef _WIN32
+#include <direct.h>
+#else
+#include <sys/types.h>
+#include <sys/wait.h>
+#endif
+#include "globals.h"
+
+#define MAX_LINE_LENGTH 256
+
+namespace po = boost::program_options;
+namespace fs = boost::filesystem;
+
+namespace mongo {
+
+ void setupSignals( bool inFork );
+ string getHostNameCached();
+ static BSONArray argvArray;
+ static BSONObj parsedOpts;
+
+ void CmdLine::addGlobalOptions( boost::program_options::options_description& general ,
+ boost::program_options::options_description& hidden ) {
+ /* support for -vv -vvvv etc. */
+ for (string s = "vv"; s.length() <= 12; s.append("v")) {
+ hidden.add_options()(s.c_str(), "verbose");
+ }
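+ // e.g. "-vvv" matches the hidden option "vvv"; store() below maps it to
+ // logLevel = 3 (the count of v's).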
+
+ general.add_options()
+ ("help,h", "show this usage information")
+ ("version", "show version information")
+ ("config,f", po::value<string>(), "configuration file specifying additional options")
+ ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. -vvvvv)")
+ ("quiet", "quieter output")
+ ("port", po::value<int>(&cmdLine.port), "specify port number")
+ ("bind_ip", po::value<string>(&cmdLine.bind_ip), "comma separated list of ip addresses to listen on - all local ips by default")
+ ("maxConns",po::value<int>(), "max number of simultaneous connections")
+ ("objcheck", "inspect client data for validity on receipt")
+ ("logpath", po::value<string>() , "log file to send write to instead of stdout - has to be a file, not directory" )
+ ("logappend" , "append to logpath instead of over-writing" )
+ ("pidfilepath", po::value<string>(), "full path to pidfile (if not set, no pidfile is created)")
+ ("keyFile", po::value<string>(), "private key for cluster authentication (only for replica sets)")
+#ifndef _WIN32
+ ("nounixsocket", "disable listening on unix sockets")
+ ("unixSocketPrefix", po::value<string>(), "alternative directory for UNIX domain sockets (defaults to /tmp)")
+ ("fork" , "fork server process" )
+ ("syslog" , "log to system's syslog facility instead of file or stdout" )
+#endif
+ ;
+
+ hidden.add_options()
+ ("cloud", po::value<string>(), "custom dynamic host naming")
+#ifdef MONGO_SSL
+ ("sslOnNormalPorts" , "use ssl on configured ports" )
+ ("sslPEMKeyFile" , po::value<string>(&cmdLine.sslPEMKeyFile), "PEM file for ssl" )
+ ("sslPEMKeyPassword" , new PasswordValue(&cmdLine.sslPEMKeyPassword) , "PEM file password" )
+#endif
+ ;
+
+ }
+
+
+#if defined(_WIN32)
+ void CmdLine::addWindowsOptions( boost::program_options::options_description& windows ,
+ boost::program_options::options_description& hidden ) {
+ windows.add_options()
+ ("install", "install mongodb service")
+ ("remove", "remove mongodb service")
+ ("reinstall", "reinstall mongodb service (equivilant of mongod --remove followed by mongod --install)")
+ ("serviceName", po::value<string>(), "windows service name")
+ ("serviceDisplayName", po::value<string>(), "windows service display name")
+ ("serviceDescription", po::value<string>(), "windows service description")
+ ("serviceUser", po::value<string>(), "user name service executes as")
+ ("servicePassword", po::value<string>(), "password used to authenticate serviceUser")
+ ;
+ hidden.add_options()("service", "start mongodb service");
+ }
+#endif
+
+ void CmdLine::parseConfigFile( istream &f, stringstream &ss ) {
+ string s;
+ char line[MAX_LINE_LENGTH];
+
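+ // the expected format is simple "key = value" lines, e.g. (hypothetical file):
+ // port = 27018
+ // logpath = /var/log/mongod.log
+ // quiet = true
+ // lines of the form "option = false" are rejected below because boost's parser
+ // treats the mere presence of a switch option as turning it on.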
+ while ( f ) {
+ f.getline(line, MAX_LINE_LENGTH);
+ s = line;
+ // erase-remove idiom: std::remove alone only shuffles characters forward and
+ // does not shrink the string, so its result must be passed to erase()
+ s.erase( std::remove(s.begin(), s.end(), ' '), s.end() );
+ s.erase( std::remove(s.begin(), s.end(), '\t'), s.end() );
+ boost::to_upper(s);
+
+ if ( s.find( "FASTSYNC" ) != string::npos )
+ cout << "warning \"fastsync\" should not be put in your configuration file" << endl;
+
+ if ( s.c_str()[0] == '#' ) {
+ // skipping commented line
+ } else if ( s.find( "=FALSE" ) == string::npos ) {
+ ss << line << endl;
+ } else {
+ cout << "warning: remove or comment out this line by starting it with \'#\', skipping now : " << line << endl;
+ }
+ }
+ return;
+ }
+
+#ifndef _WIN32
+ // support for exit value propagation with fork
+ void launchSignal( int sig ) {
+ if ( sig == SIGUSR2 ) {
+ pid_t cur = getpid();
+
+ if ( cur == cmdLine.parentProc || cur == cmdLine.leaderProc ) {
+ // signal indicates successful start allowing us to exit
+ _exit(0);
+ }
+ }
+ }
+
+ void setupLaunchSignals() {
+ assert( signal(SIGUSR2 , launchSignal ) != SIG_ERR );
+ }
+
+
+ void CmdLine::launchOk() {
+ if ( cmdLine.doFork ) {
+ // killing leader will propagate to parent
+ assert( kill( cmdLine.leaderProc, SIGUSR2 ) == 0 );
+ }
+ }
+#endif
+
+ bool CmdLine::store( int argc , char ** argv ,
+ boost::program_options::options_description& visible,
+ boost::program_options::options_description& hidden,
+ boost::program_options::positional_options_description& positional,
+ boost::program_options::variables_map &params ) {
+
+
+ {
+ // setup binary name
+ cmdLine.binaryName = argv[0];
+ size_t i = cmdLine.binaryName.rfind( '/' );
+ if ( i != string::npos )
+ cmdLine.binaryName = cmdLine.binaryName.substr( i + 1 );
+
+ // setup cwd
+ char buffer[1024];
+#ifdef _WIN32
+ assert( _getcwd( buffer , 1000 ) );
+#else
+ assert( getcwd( buffer , 1000 ) );
+#endif
+ cmdLine.cwd = buffer;
+ }
+
+
+ /* don't allow guessing - creates ambiguities when some options are
+ * prefixes of others. allow long disguises and don't allow guessing
+ * to get away with our vvvvvvv trick. */
+ int style = (((po::command_line_style::unix_style ^
+ po::command_line_style::allow_guessing) |
+ po::command_line_style::allow_long_disguise) ^
+ po::command_line_style::allow_sticky);
+
+
+ try {
+
+ po::options_description all;
+ all.add( visible );
+ all.add( hidden );
+
+ po::store( po::command_line_parser(argc, argv)
+ .options( all )
+ .positional( positional )
+ .style( style )
+ .run(),
+ params );
+
+ if ( params.count("config") ) {
+ ifstream f( params["config"].as<string>().c_str() );
+ if ( ! f.is_open() ) {
+ cout << "ERROR: could not read from config file" << endl << endl;
+ cout << visible << endl;
+ return false;
+ }
+
+ stringstream ss;
+ CmdLine::parseConfigFile( f, ss );
+ po::store( po::parse_config_file( ss , all ) , params );
+ f.close();
+ }
+
+ po::notify(params);
+ }
+ catch (po::error &e) {
+ cout << "error command line: " << e.what() << endl;
+ cout << "use --help for help" << endl;
+ //cout << visible << endl;
+ return false;
+ }
+
+ if (params.count("verbose")) {
+ logLevel = 1;
+ }
+
+ for (string s = "vv"; s.length() <= 12; s.append("v")) {
+ if (params.count(s)) {
+ logLevel = s.length();
+ }
+ }
+
+ if (params.count("quiet")) {
+ cmdLine.quiet = true;
+ }
+
+ if ( params.count( "maxConns" ) ) {
+ int newSize = params["maxConns"].as<int>();
+ if ( newSize < 5 ) {
+ out() << "maxConns has to be at least 5" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ else if ( newSize >= 10000000 ) {
+ out() << "maxConns can't be greater than 10000000" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ connTicketHolder.resize( newSize );
+ }
+
+ if (params.count("objcheck")) {
+ cmdLine.objcheck = true;
+ }
+
+ string logpath;
+
+#ifndef _WIN32
+ if (params.count("unixSocketPrefix")) {
+ cmdLine.socket = params["unixSocketPrefix"].as<string>();
+ if (!fs::is_directory(cmdLine.socket)) {
+ cout << cmdLine.socket << " must be a directory" << endl;
+ ::exit(-1);
+ }
+ }
+
+ if (params.count("nounixsocket")) {
+ cmdLine.noUnixSocket = true;
+ }
+
+ if (params.count("fork")) {
+ cmdLine.doFork = true;
+ if ( ! params.count( "logpath" ) && ! params.count( "syslog" ) ) {
+ cout << "--fork has to be used with --logpath or --syslog" << endl;
+ ::exit(-1);
+ }
+
+ if ( params.count( "logpath" ) ) {
+ // test logpath
+ logpath = params["logpath"].as<string>();
+ assert( logpath.size() );
+ if ( logpath[0] != '/' ) {
+ logpath = cmdLine.cwd + "/" + logpath;
+ }
+ FILE * test = fopen( logpath.c_str() , "a" );
+ if ( ! test ) {
+ cout << "can't open [" << logpath << "] for log file: " << errnoWithDescription() << endl;
+ ::exit(-1);
+ }
+ fclose( test );
+ }
+
+ cout.flush();
+ cerr.flush();
+
+ cmdLine.parentProc = getpid();
+
+ // facilitate clean exit when child starts successfully
+ setupLaunchSignals();
+
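+ // daemonization sketch: fork twice (parent -> leader -> worker), with both
+ // ancestors blocking in waitpid. when startup succeeds the worker calls
+ // launchOk(), which SIGUSR2s the leader; launchSignal() _exit(0)s the leader,
+ // the waiting parent then exits 0 as well, and the invoking shell regains
+ // control only once the server is actually up.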
+ pid_t c = fork();
+ if ( c ) {
+ int pstat;
+ waitpid(c, &pstat, 0);
+
+ if ( WIFEXITED(pstat) ) {
+ if ( ! WEXITSTATUS(pstat) ) {
+ cout << "child process started successfully, parent exiting" << endl;
+ }
+
+ _exit( WEXITSTATUS(pstat) );
+ }
+
+ _exit(50);
+ }
+
+ if ( chdir("/") < 0 ) {
+ cout << "Cant chdir() while forking server process: " << strerror(errno) << endl;
+ ::exit(-1);
+ }
+ setsid();
+
+ cmdLine.leaderProc = getpid();
+
+ pid_t c2 = fork();
+ if ( c2 ) {
+ int pstat;
+ cout << "forked process: " << c2 << endl;
+ waitpid(c2, &pstat, 0);
+
+ if ( WIFEXITED(pstat) ) {
+ _exit( WEXITSTATUS(pstat) );
+ }
+
+ _exit(51);
+ }
+
+ // stdout handled in initLogging
+ //fclose(stdout);
+ //freopen("/dev/null", "w", stdout);
+
+ fclose(stderr);
+ fclose(stdin);
+
+ FILE* f = freopen("/dev/null", "w", stderr);
+ if ( f == NULL ) {
+ cout << "Cant reassign stderr while forking server process: " << strerror(errno) << endl;
+ ::exit(-1);
+ }
+
+ f = freopen("/dev/null", "r", stdin);
+ if ( f == NULL ) {
+ cout << "Cant reassign stdin while forking server process: " << strerror(errno) << endl;
+ ::exit(-1);
+ }
+
+ setupCoreSignals();
+ setupSignals( true );
+ }
+
+ if (params.count("syslog")) {
+ StringBuilder sb(128);
+ sb << cmdLine.binaryName << "." << cmdLine.port;
+ Logstream::useSyslog( sb.str().c_str() );
+ }
+#endif
+ if (params.count("logpath")) {
+ if ( params.count("syslog") ) {
+ cout << "Cant use both a logpath and syslog " << endl;
+ ::exit(-1);
+ }
+
+ if ( logpath.size() == 0 )
+ logpath = params["logpath"].as<string>();
+ uassert( 10033 , "logpath has to be non-zero" , logpath.size() );
+ initLogging( logpath , params.count( "logappend" ) );
+ }
+
+ if ( params.count("pidfilepath")) {
+ writePidFile( params["pidfilepath"].as<string>() );
+ }
+
+ if (params.count("keyFile")) {
+ const string f = params["keyFile"].as<string>();
+
+ if (!setUpSecurityKey(f)) {
+ // error message printed in setUpSecurityKey
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ cmdLine.keyFile = true;
+ noauth = false;
+ }
+ else {
+ cmdLine.keyFile = false;
+ }
+
+#ifdef MONGO_SSL
+ if (params.count("sslOnNormalPorts") ) {
+ cmdLine.sslOnNormalPorts = true;
+
+ if ( cmdLine.sslPEMKeyPassword.size() == 0 ) {
+ log() << "need sslPEMKeyPassword" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ if ( cmdLine.sslPEMKeyFile.size() == 0 ) {
+ log() << "need sslPEMKeyFile" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ cmdLine.sslServerManager = new SSLManager( false );
+ cmdLine.sslServerManager->setupPEM( cmdLine.sslPEMKeyFile , cmdLine.sslPEMKeyPassword );
+ }
+
+ // PEM key settings are meaningless unless ssl is actually enabled
+ else if ( cmdLine.sslPEMKeyFile.size() || cmdLine.sslPEMKeyPassword.size() ) {
+ log() << "need to enable sslOnNormalPorts" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+#endif
+
+ {
+ BSONObjBuilder b;
+ for (po::variables_map::const_iterator it(params.begin()), end(params.end()); it != end; it++){
+ if (!it->second.defaulted()){
+ const string& key = it->first;
+ const po::variable_value& value = it->second;
+ const type_info& type = value.value().type();
+
+ if (type == typeid(string)){
+ if (value.as<string>().empty())
+ b.appendBool(key, true); // boost po uses empty string for flags like --quiet
+ else
+ b.append(key, value.as<string>());
+ }
+ else if (type == typeid(int))
+ b.append(key, value.as<int>());
+ else if (type == typeid(double))
+ b.append(key, value.as<double>());
+ else if (type == typeid(bool))
+ b.appendBool(key, value.as<bool>());
+ else if (type == typeid(long))
+ b.appendNumber(key, (long long)value.as<long>());
+ else if (type == typeid(unsigned))
+ b.appendNumber(key, (long long)value.as<unsigned>());
+ else if (type == typeid(unsigned long long))
+ b.appendNumber(key, (long long)value.as<unsigned long long>());
+ else if (type == typeid(vector<string>))
+ b.append(key, value.as<vector<string> >());
+ else
+ b.append(key, "UNKNOWN TYPE: " + demangleName(type));
+ }
+ }
+ parsedOpts = b.obj();
+ }
+
+ {
+ BSONArrayBuilder b;
+ for (int i=0; i < argc; i++)
+ b << argv[i];
+ argvArray = b.arr();
+ }
+
+ return true;
+ }
+
+ void printCommandLineOpts() {
+ log() << "options: " << parsedOpts << endl;
+ }
+
+ void ignoreSignal( int sig ) {}
+
+ void setupCoreSignals() {
+#if !defined(_WIN32)
+ assert( signal(SIGUSR1 , rotateLogs ) != SIG_ERR );
+ assert( signal(SIGHUP , ignoreSignal ) != SIG_ERR );
+#endif
+ }
+
+ class CmdGetCmdLineOpts : Command {
+ public:
+ CmdGetCmdLineOpts(): Command("getCmdLineOpts") {}
+ void help(stringstream& h) const { h << "get argv"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool slaveOk() const { return true; }
+
+ virtual bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ result.append("argv", argvArray);
+ result.append("parsed", parsedOpts);
+ return true;
+ }
+
+ } cmdGetCmdLineOpts;
+
+ string prettyHostName() {
+ StringBuilder s(128);
+ s << getHostNameCached();
+ if( cmdLine.port != CmdLine::DefaultDBPort )
+ s << ':' << mongo::cmdLine.port;
+ return s.str();
+ }
+
+ casi< map<string,ParameterValidator*> * > pv_all (NULL);
+
+ ParameterValidator::ParameterValidator( const string& name ) : _name( name ) {
+ if ( ! pv_all)
+ pv_all.ref() = new map<string,ParameterValidator*>();
+ (*pv_all.ref())[_name] = this;
+ }
+
+ ParameterValidator * ParameterValidator::get( const string& name ) {
+ map<string,ParameterValidator*>::const_iterator i = pv_all.get()->find( name );
+ if ( i == pv_all.get()->end() )
+ return NULL;
+ return i->second;
+ }
+
+}
diff --git a/src/mongo/db/cmdline.h b/src/mongo/db/cmdline.h
new file mode 100644
index 00000000000..5fe6ceb1005
--- /dev/null
+++ b/src/mongo/db/cmdline.h
@@ -0,0 +1,203 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+#ifdef MONGO_SSL
+ class SSLManager;
+#endif
+
+ /* command line options
+ */
+ /* concurrency: OK/READ */
+ struct CmdLine {
+
+ CmdLine();
+
+ string binaryName; // mongod or mongos
+ string cwd; // cwd of when process started
+
+ // this is suboptimal as someone could rename a binary. todo...
+ bool isMongos() const { return binaryName == "mongos"; }
+
+ int port; // --port
+ enum {
+ DefaultDBPort = 27017,
+ ConfigServerPort = 27019,
+ ShardServerPort = 27018
+ };
+ bool isDefaultPort() const { return port == DefaultDBPort; }
+
+ string bind_ip; // --bind_ip
+ bool rest; // --rest
+ bool jsonp; // --jsonp
+
+ string _replSet; // --replSet[/<seedlist>]
+ string ourSetName() const {
+ string setname;
+ size_t sl = _replSet.find('/');
+ if( sl == string::npos )
+ return _replSet;
+ return _replSet.substr(0, sl);
+ }
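+ // e.g. --replSet "rs0/h1:27017,h2:27017" gives ourSetName() == "rs0"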
+ bool usingReplSets() const { return !_replSet.empty(); }
+
+ // for master/slave replication
+ string source; // --source
+ string only; // --only
+
+ bool quiet; // --quiet
+ bool noTableScan; // --notablescan no table scans allowed
+ bool prealloc; // --noprealloc no preallocation of data files
+ bool preallocj; // --nopreallocj no preallocation of journal files
+ bool smallfiles; // --smallfiles allocate smaller data files
+
+ bool configsvr; // --configsvr
+
+ bool quota; // --quota
+ int quotaFiles; // --quotaFiles
+ bool cpu; // --cpu show cpu time periodically
+
+ bool dur; // --dur durability (now --journal)
+ unsigned journalCommitInterval; // group/batch commit interval ms
+
+ /** --durOptions 7 dump journal and terminate without doing anything further
+ --durOptions 4 recover and terminate without listening
+ */
+ enum { // bits to be ORed
+ DurDumpJournal = 1, // dump diagnostics on the journal during recovery
+ DurScanOnly = 2, // don't do any real work, just scan and dump if dump specified
+ DurRecoverOnly = 4, // terminate after recovery step
+ DurParanoid = 8, // paranoid mode enables extra checks
+ DurAlwaysCommit = 16, // do a group commit every time the writelock is released
+ DurAlwaysRemap = 32, // remap the private view after every group commit (may lag to the next write lock acquisition, but will do all files then)
+ DurNoCheckSpace = 64 // don't check that there is enough room for journal files before startup (for diskfull tests)
+ };
+ int durOptions; // --durOptions <n> for debugging
+
+ bool objcheck; // --objcheck
+
+ long long oplogSize; // --oplogSize
+ int defaultProfile; // --profile
+ int slowMS; // --time in ms that is "slow"
+
+ int pretouch; // --pretouch for replication application (experimental)
+ bool moveParanoia; // for move chunk paranoia
+ double syncdelay; // seconds between fsyncs
+
+ bool noUnixSocket; // --nounixsocket
+ bool doFork; // --fork
+ string socket; // UNIX domain socket directory
+
+ bool keyFile;
+
+#ifndef _WIN32
+ pid_t parentProc; // --fork pid of initial process
+ pid_t leaderProc; // --fork pid of leader process
+#endif
+
+#ifdef MONGO_SSL
+ bool sslOnNormalPorts; // --sslOnNormalPorts
+ string sslPEMKeyFile; // --sslPEMKeyFile
+ string sslPEMKeyPassword; // --sslPEMKeyPassword
+
+ SSLManager* sslServerManager; // currently leaks on close
+#endif
+
+ static void launchOk();
+
+ static void addGlobalOptions( boost::program_options::options_description& general ,
+ boost::program_options::options_description& hidden );
+
+ static void addWindowsOptions( boost::program_options::options_description& windows ,
+ boost::program_options::options_description& hidden );
+
+
+ static void parseConfigFile( istream &f, stringstream &ss);
+ /**
+ * @return true if should run program, false if should exit
+ */
+ static bool store( int argc , char ** argv ,
+ boost::program_options::options_description& visible,
+ boost::program_options::options_description& hidden,
+ boost::program_options::positional_options_description& positional,
+ boost::program_options::variables_map &output );
+
+ time_t started;
+ };
+
+ // todo move to cmdline.cpp?
+ inline CmdLine::CmdLine() :
+ port(DefaultDBPort), rest(false), jsonp(false), quiet(false), noTableScan(false), prealloc(true), preallocj(true), smallfiles(sizeof(int*) == 4),
+ configsvr(false),
+ quota(false), quotaFiles(8), cpu(false), durOptions(0), objcheck(false), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true ),
+ syncdelay(60), noUnixSocket(false), doFork(0), socket("/tmp")
+ {
+ started = time(0);
+
+ journalCommitInterval = 0; // 0 means use default
+ dur = false;
+#if defined(_DURABLEDEFAULTON)
+ dur = true;
+#endif
+ if( sizeof(void*) == 8 )
+ dur = true;
+#if defined(_DURABLEDEFAULTOFF)
+ dur = false;
+#endif
+
+#ifdef MONGO_SSL
+ sslOnNormalPorts = false;
+ sslServerManager = 0;
+#endif
+ }
+
+ extern CmdLine cmdLine;
+
+ void setupLaunchSignals();
+ void setupCoreSignals();
+
+ string prettyHostName();
+
+ void printCommandLineOpts();
+
+ /**
+ * used for the setParameter command.
+ * lets you write validation code that lives with the code using the parameter,
+ * rather than centralizing it all in the command implementation.
+ * also lets you have mongos- or mongod-specific validation
+ * without pulling in all sorts of dependencies.
+ */
+ class ParameterValidator {
+ public:
+ ParameterValidator( const string& name );
+ virtual ~ParameterValidator() {}
+
+ virtual bool isValid( BSONElement e , string& errmsg ) const = 0;
+
+ static ParameterValidator * get( const string& name );
+
+ private:
+ const string _name;
+ };
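+ /* minimal usage sketch (hypothetical validator, not part of this change):
+
+ class LogLevelValidator : public ParameterValidator {
+ public:
+ LogLevelValidator() : ParameterValidator( "logLevel" ) {}
+ virtual bool isValid( BSONElement e , string& errmsg ) const {
+ if ( e.isNumber() && e.numberInt() >= 0 && e.numberInt() <= 5 )
+ return true;
+ errmsg = "logLevel must be an integer 0-5";
+ return false;
+ }
+ } logLevelValidatorSingleton;
+ */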
+
+}
+
diff --git a/src/mongo/db/collection.h b/src/mongo/db/collection.h
new file mode 100644
index 00000000000..998b2f0beac
--- /dev/null
+++ b/src/mongo/db/collection.h
@@ -0,0 +1,15 @@
+// @file collection.h
+
+#pragma once
+
+#include "namespace.h"
+
+namespace mongo {
+
+ class Collection {
+ public:
+ NamespaceDetails * const d;
+ NamespaceDetailsTransient * const nsd;
+ };
+
+}
diff --git a/src/mongo/db/commands.cpp b/src/mongo/db/commands.cpp
new file mode 100755
index 00000000000..cbe9ffc6861
--- /dev/null
+++ b/src/mongo/db/commands.cpp
@@ -0,0 +1,209 @@
+/* commands.cpp
+ db "commands" (sent via db.$cmd.findOne(...))
+ */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "jsobj.h"
+#include "commands.h"
+#include "client.h"
+#include "replutil.h"
+
+namespace mongo {
+
+ map<string,Command*> * Command::_commandsByBestName;
+ map<string,Command*> * Command::_webCommands;
+ map<string,Command*> * Command::_commands;
+
+ string Command::parseNsFullyQualified(const string& dbname, const BSONObj& cmdObj) const {
+ string s = cmdObj.firstElement().valuestr();
+ NamespaceString nss(s);
+ // these are for security, do not remove:
+ verify(15966, dbname == nss.db || dbname == "admin" );
+ verify(15962, !nss.db.empty() );
+ return s;
+ }
+
+ /*virtual*/ string Command::parseNs(const string& dbname, const BSONObj& cmdObj) const {
+ string coll = cmdObj.firstElement().valuestr();
+#if defined(CLC)
+ DEV if( mongoutils::str::startsWith(coll, dbname+'.') ) {
+ log() << "DEBUG parseNs Command's collection name looks like it includes the db name\n"
+ << dbname << '\n'
+ << coll << '\n'
+ << cmdObj.toString() << endl;
+ dassert(false);
+ }
+#endif
+ return dbname + '.' + coll;
+ }
+
+ void Command::htmlHelp(stringstream& ss) const {
+ string helpStr;
+ {
+ stringstream h;
+ help(h);
+ helpStr = h.str();
+ }
+ ss << "\n<tr><td>";
+ bool web = _webCommands->count(name) != 0;
+ if( web ) ss << "<a href=\"/" << name << "?text=1\">";
+ ss << name;
+ if( web ) ss << "</a>";
+ ss << "</td>\n";
+ ss << "<td>";
+ int l = locktype();
+ //if( l == NONE ) ss << "N ";
+ if( l == READ ) ss << "R ";
+ else if( l == WRITE ) ss << "W ";
+ if( slaveOk() )
+ ss << "S ";
+ if( adminOnly() )
+ ss << "A";
+ ss << "</td>";
+ ss << "<td>";
+ if( helpStr != "no help defined" ) {
+ const char *p = helpStr.c_str();
+ while( *p ) {
+ if( *p == '<' ) {
+ ss << "&lt;";
+ p++; continue;
+ }
+ else if( *p == '{' )
+ ss << "<code>";
+ else if( *p == '}' ) {
+ ss << "}</code>";
+ p++;
+ continue;
+ }
+ if( strncmp(p, "http:", 5) == 0 ) {
+ ss << "<a href=\"";
+ const char *q = p;
+ while( *q && *q != ' ' && *q != '\n' )
+ ss << *q++;
+ ss << "\">";
+ q = p;
+ if( startsWith(q, "http://www.mongodb.org/display/") )
+ q += 31;
+ while( *q && *q != ' ' && *q != '\n' ) {
+ ss << (*q == '+' ? ' ' : *q);
+ q++;
+ if( *q == '#' )
+ while( *q && *q != ' ' && *q != '\n' ) q++;
+ }
+ ss << "</a>";
+ p = q;
+ continue;
+ }
+ if( *p == '\n' ) ss << "<br>";
+ else ss << *p;
+ p++;
+ }
+ }
+ ss << "</td>";
+ ss << "</tr>\n";
+ }
+
+ Command::Command(const char *_name, bool web, const char *oldName) : name(_name) {
+ // register ourself.
+ if ( _commands == 0 )
+ _commands = new map<string,Command*>;
+ if( _commandsByBestName == 0 )
+ _commandsByBestName = new map<string,Command*>;
+ Command*& c = (*_commands)[name];
+ if ( c )
+ log() << "warning: 2 commands with name: " << _name << endl;
+ c = this;
+ (*_commandsByBestName)[name] = this;
+
+ if( web ) {
+ if( _webCommands == 0 )
+ _webCommands = new map<string,Command*>;
+ (*_webCommands)[name] = this;
+ }
+
+ if( oldName )
+ (*_commands)[oldName] = this;
+ }
+
+ void Command::help( stringstream& help ) const {
+ help << "no help defined";
+ }
+
+ Command* Command::findCommand( const string& name ) {
+ map<string,Command*>::iterator i = _commands->find( name );
+ if ( i == _commands->end() )
+ return 0;
+ return i->second;
+ }
+
+
+ Command::LockType Command::locktype( const string& name ) {
+ Command * c = findCommand( name );
+ if ( ! c )
+ return WRITE;
+ return c->locktype();
+ }
+
+ void Command::logIfSlow( const Timer& timer, const string& msg ) {
+ int ms = timer.millis();
+ if ( ms > cmdLine.slowMS ) {
+ out() << msg << " took " << ms << " ms." << endl;
+ }
+ }
+
+}
+
+#include "../client/connpool.h"
+
+namespace mongo {
+
+ extern DBConnectionPool pool;
+
+ class PoolFlushCmd : public Command {
+ public:
+ PoolFlushCmd() : Command( "connPoolSync" , false , "connpoolsync" ) {}
+ virtual void help( stringstream &help ) const { help<<"internal"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string&, mongo::BSONObj&, int, std::string&, mongo::BSONObjBuilder& result, bool) {
+ pool.flush();
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ } poolFlushCmd;
+
+ class PoolStats : public Command {
+ public:
+ PoolStats() : Command( "connPoolStats" ) {}
+ virtual void help( stringstream &help ) const { help<<"stats about connection pool"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string&, mongo::BSONObj&, int, std::string&, mongo::BSONObjBuilder& result, bool) {
+ pool.appendInfo( result );
+ result.append( "numDBClientConnection" , DBClientConnection::getNumConnections() );
+ result.append( "numAScopedConnection" , AScopedConnection::getNumConnections() );
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ } poolStatsCmd;
+
+} // namespace mongo
diff --git a/src/mongo/db/commands.h b/src/mongo/db/commands.h
new file mode 100644
index 00000000000..85cdd38d7a4
--- /dev/null
+++ b/src/mongo/db/commands.h
@@ -0,0 +1,164 @@
+// commands.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+
+ class BSONObj;
+ class BSONObjBuilder;
+ class Client;
+ class Timer;
+
+ /** mongodb "commands" (sent via db.$cmd.findOne(...))
+ subclass to make a command. define a singleton object for it.
+ */
+ class Command {
+ protected:
+ string parseNsFullyQualified(const string& dbname, const BSONObj& cmdObj) const;
+ public:
+ // only makes sense for commands where the 1st param is the collection.
+ virtual string parseNs(const string& dbname, const BSONObj& cmdObj) const;
+
+ enum LockType { READ = -1 , NONE = 0 , WRITE = 1 };
+
+ const string name;
+
+ /* run the given command
+ implement this...
+
+ fromRepl - command is being invoked as part of replication syncing. In this situation you
+ normally do not want to log the command to the local oplog.
+
+ return value is true if succeeded. if false, set errmsg text.
+ */
+ virtual bool run(const string& db, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) = 0;
+
+ /*
+ note: logTheOp() MUST be false if READ
+ if NONE, can't use Client::Context setup
+ use with caution
+ */
+ virtual LockType locktype() const = 0;
+
+ /* Return true if only the admin ns has privileges to run this command. */
+ virtual bool adminOnly() const {
+ return false;
+ }
+
+ void htmlHelp(stringstream&) const;
+
+ /* Like adminOnly, but even stricter: we must either be authenticated for admin db,
+ or, if running without auth, on the local interface. Used for things which
+ are so major that remote invocation may not make sense (e.g., shutdownServer).
+
+ When localHostOnlyIfNoAuth() is true, adminOnly() must also be true.
+ */
+ virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return false; }
+
+ /* Return true if slaves are allowed to execute the command
+ (i.e. when the command arrives directly from a client -- if fromRepl, it is always allowed).
+ */
+ virtual bool slaveOk() const = 0;
+
+ /* Return true if the client force a command to be run on a slave by
+ turning on the 'slaveOk' option in the command query.
+ */
+ virtual bool slaveOverrideOk() {
+ return false;
+ }
+
+ /* Override and return true to log the operation (logOp()) to the replication log.
+ (not done if fromRepl of course)
+
+ Note if run() returns false, we do NOT log.
+ */
+ virtual bool logTheOp() { return false; }
+
+ virtual void help( stringstream& help ) const;
+
+ /* Return true if authentication and security applies to the commands. Some commands
+ (e.g., getnonce, authenticate) can be done by anyone even unauthorized.
+ */
+ virtual bool requiresAuth() { return true; }
+
+ /* Return true if a replica set secondary should go into "recovering"
+ (unreadable) state while running this command.
+ */
+ virtual bool maintenanceMode() const { return false; }
+
+ /* Return true if command should be permitted when a replica set secondary is in "recovering"
+ (unreadable) state.
+ */
+ virtual bool maintenanceOk() const { return true; /* assumed true prior to commit */ }
+
+ /** @param webUI expose the command in the web ui as localhost:28017/<name>
+ @param oldName an optional old, deprecated name for the command
+ */
+ Command(const char *_name, bool webUI = false, const char *oldName = 0);
+
+ virtual ~Command() {}
+
+ protected:
+ BSONObj getQuery( const BSONObj& cmdObj ) {
+ if ( cmdObj["query"].type() == Object )
+ return cmdObj["query"].embeddedObject();
+ if ( cmdObj["q"].type() == Object )
+ return cmdObj["q"].embeddedObject();
+ return BSONObj();
+ }
+
+ static void logIfSlow( const Timer& cmdTimer, const string& msg);
+
+ static map<string,Command*> * _commands;
+ static map<string,Command*> * _commandsByBestName;
+ static map<string,Command*> * _webCommands;
+
+ public:
+ static const map<string,Command*>* commandsByBestName() { return _commandsByBestName; }
+ static const map<string,Command*>* webCommands() { return _webCommands; }
+ /** @return if command was found and executed */
+ static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder, int queryOptions = 0);
+ static LockType locktype( const string& name );
+ static Command * findCommand( const string& name );
+ };
+
+ class CmdShutdown : public Command {
+ public:
+ virtual bool requiresAuth() { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return true; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream& help ) const;
+ CmdShutdown() : Command("shutdown") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ private:
+ bool shutdownHelper();
+ };
+
+ bool _runCommands(const char *ns, BSONObj& jsobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions);
+
+} // namespace mongo
diff --git a/src/mongo/db/commands/aggregate.js b/src/mongo/db/commands/aggregate.js
new file mode 100755
index 00000000000..7741e3121ff
--- /dev/null
+++ b/src/mongo/db/commands/aggregate.js
@@ -0,0 +1,184 @@
+/* sample aggregate command queries */
+
+// make sure we're using the right db; this is the same as "use mydb;" in shell
+db = db.getSisterDB("mydb");
+
+// just passing through fields
+var p1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ tags : 1,
+ pageViews : 1
+ }}
+]});
+
+// unwinding an array
+var p2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }}
+]});
+
+// pulling values out of subdocuments
+var p3 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ otherfoo : "other.foo",
+ otherbar : "other.bar"
+ }}
+]});
+
+// projection includes a computed value
+var p4 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ daveWroteIt : { $eq:["$author", "dave"] }
+ }}
+]});
+
+// projection includes a virtual (fabricated) document
+var p5 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ pageViews : 1,
+ tag : { $unwind : "tags" }
+ }},
+ { $project : {
+ author : 1,
+ subDocument : { foo : "pageViews", bar : "tag" }
+ }}
+]});
+
+// multi-step aggregate
+// nested expressions in computed fields
+var p6 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $project : {
+ author : 1,
+ tag : 1,
+ pageViews : 1,
+ daveWroteIt : { $eq:["$author", "dave"] },
+ weLikeIt : { $or:[ { $eq:["$author", "dave"] },
+ { $eq:["$tag", "good"] } ] }
+ }}
+]});
+
+// slightly more complex computed expression; $ifnull
+var p7 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ theSum : { $add:["$pageViews",
+ { $ifnull:["$other.foo",
+ "$other.bar"] } ] }
+ }}
+]});
+
+// dotted path inclusion; _id exclusion
+var p8 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ _id : 0,
+ author : 1,
+ tag : { $unwind : "tags" },
+ "comments.author" : 1
+ }}
+]});
+
+
+// simple matching
+var m1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $match : { author : "dave" } }
+]});
+
+// combining matching with a projection
+var m2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ title : 1,
+ author : 1,
+ pageViews : 1,
+ tag : { $unwind : "tags" },
+ comments : 1
+ }},
+ { $match : { tag : "nasty" } }
+]});
+
+
+// group by tag
+var g1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" }
+ }}
+]});
+
+// $max, and averaging in a final projection
+var g2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" },
+ mostViewsByTag : { $max : "$pageViews" },
+ }},
+ { $project : {
+ _id: false,
+ tag : "_id.tag",
+ mostViewsByTag : 1,
+ docsByTag : 1,
+ viewsByTag : 1,
+ avgByTag : { $divide:["$viewsByTag", "$docsByTag"] }
+ }}
+]});
+
+// $push as an accumulator; can pivot data
+var g3 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" }
+ }},
+ { $group : {
+ _id : { tag : 1 },
+ authors : { $push : "$author" }
+ }}
+]});
+
+// $avg accumulator, averaging within the $group itself (no final projection needed)
+var g4 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" },
+ avgByTag : { $avg : "$pageViews" },
+ }}
+]});
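+
+// each var above holds the raw command reply; on success the pipeline's documents
+// are expected in the reply's "result" array, e.g. (sketch):
+//     p1.result.forEach(printjson);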
diff --git a/src/mongo/db/commands/cloud.cpp b/src/mongo/db/commands/cloud.cpp
new file mode 100644
index 00000000000..8f9d9d2e4b5
--- /dev/null
+++ b/src/mongo/db/commands/cloud.cpp
@@ -0,0 +1,90 @@
+#include "../commands.h"
+#include <map>
+#include "../../util/concurrency/value.h"
+#include "../../util/mongoutils/str.h"
+#include "../../util/net/hostandport.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ mapsf<string,string> dynHostNames;
+ extern DiagStr _hostNameCached;
+
+ string dynHostMyName() {
+ if( !str::startsWith(_hostNameCached, '#') )
+ return "";
+ return _hostNameCached;
+ }
+
+ void dynHostResolve(string& name, int& port) {
+ assert( !name.empty() );
+ assert( !str::contains(name, ':') );
+ assert( str::startsWith(name, '#') );
+ string s = dynHostNames.get(name);
+ if( s.empty() ) {
+ name.clear();
+ return;
+ }
+ assert( !str::startsWith(s, '#') );
+ HostAndPort hp(s);
+ if( hp.hasPort() ) {
+ port = hp.port();
+ log() << "info: dynhost in:" << name << " out:" << hp.toString() << endl;
+ }
+ name = hp.host();
+ }
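+
+    // usage sketch (hypothetical values): with dynHostNames holding
+    //     { "#a" : "10.0.0.1:27017" }
+    // a call such as
+    //     string h = "#a"; int p = 0; dynHostResolve(h, p);
+    // leaves h == "10.0.0.1" and p == 27017; an unknown name clears h instead.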
+
+ /**
+ { cloud:1, nodes: {
+ name : <ip>, ...
+ },
+ me : <mylogicalname>
+ }
+ */
+ class CmdCloud : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() { return false; }
+ virtual bool adminOnly() const { return true; } // very important
+ virtual bool localHostOnlyIfNoAuth(const BSONObj&) { return true; }
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "internal\n";
+ help << "{cloud:1,nodes:...,me:<my_logical_name>}";
+ }
+ CmdCloud() : Command("cloud") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ assert(!fromRepl);
+ BSONObj nodes = cmdObj["nodes"].Obj();
+ map<string,string> ipmap;
+ for( BSONObj::iterator i(nodes); i.more(); ) {
+ BSONElement e = i.next();
+ assert( *e.fieldName() == '#' );
+ ipmap[e.fieldName()] = e.String();
+ }
+
+ string me = cmdObj["me"].String();
+ assert( !me.empty() && me[0] == '#' );
+
+ log(/*1*/) << "CmdCloud" << endl;
+
+ if( me != _hostNameCached.get() ) {
+ log() << "CmdCloud new 'me' value:" << me << endl;
+ _hostNameCached = me;
+ }
+
+ dynHostNames.swap(ipmap);
+ return true;
+ }
+ } cmdCloud;
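+
+    // example command document accepted by CmdCloud::run() (hypothetical addresses):
+    //     { cloud : 1 , nodes : { "#a" : "10.0.0.1:27017" , "#b" : "10.0.0.2" } , me : "#a" }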
+
+ BSONObj fromjson(const string &str);
+
+ void cloudCmdLineParamIs(string cmd) {
+ string errmsg;
+ BSONObjBuilder res;
+ BSONObj o = fromjson(cmd);
+ cmdCloud.run("", o, 0, errmsg, res, false);
+ }
+}
diff --git a/src/mongo/db/commands/distinct.cpp b/src/mongo/db/commands/distinct.cpp
new file mode 100644
index 00000000000..1926e6abddb
--- /dev/null
+++ b/src/mongo/db/commands/distinct.cpp
@@ -0,0 +1,157 @@
+// distinct.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../queryoptimizer.h"
+#include "../clientcursor.h"
+#include "../../util/timer.h"
+
+namespace mongo {
+
+ class DistinctCommand : public Command {
+ public:
+ DistinctCommand() : Command("distinct") {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }";
+ }
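+
+        // example shell invocation, matching the help string above (sketch):
+        //     db.runCommand( { distinct : "coll" , key : "a.b" , query : {} } )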
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer t;
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+
+ string key = cmdObj["key"].valuestrsafe();
+ BSONObj keyPattern = BSON( key << 1 );
+
+ BSONObj query = getQuery( cmdObj );
+
+ int bufSize = BSONObjMaxUserSize - 4096;
+ BufBuilder bb( bufSize );
+ char * start = bb.buf();
+
+ BSONArrayBuilder arr( bb );
+ BSONElementSet values;
+
+ long long nscanned = 0; // locations looked at
+ long long nscannedObjects = 0; // full objects looked at
+ long long n = 0; // matches
+ MatchDetails md;
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+
+ if ( ! d ) {
+ result.appendArray( "values" , BSONObj() );
+ result.append( "stats" , BSON( "n" << 0 << "nscanned" << 0 << "nscannedObjects" << 0 ) );
+ return true;
+ }
+
+ shared_ptr<Cursor> cursor;
+ if ( ! query.isEmpty() ) {
+ cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query , BSONObj() );
+ }
+ else {
+
+            // query is empty, so let's see if we can find an index
+ // with the key so we don't have to hit the raw data
+ NamespaceDetails::IndexIterator ii = d->ii();
+ while ( ii.more() ) {
+ IndexDetails& idx = ii.next();
+
+ if ( d->isMultikey( ii.pos() - 1 ) )
+ continue;
+
+ if ( idx.inKeyPattern( key ) ) {
+ cursor = bestGuessCursor( ns.c_str() , BSONObj() , idx.keyPattern() );
+ if( cursor.get() ) break;
+ }
+
+ }
+
+ if ( ! cursor.get() )
+ cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query , BSONObj() );
+
+ }
+
+
+ assert( cursor );
+ string cursorName = cursor->toString();
+
+ auto_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns));
+
+ while ( cursor->ok() ) {
+ nscanned++;
+ bool loadedObject = false;
+
+ if ( cursor->currentMatches( &md ) && !cursor->getsetdup( cursor->currLoc() ) ) {
+ n++;
+
+ BSONObj holder;
+ BSONElementSet temp;
+ loadedObject = ! cc->getFieldsDotted( key , temp, holder );
+
+ for ( BSONElementSet::iterator i=temp.begin(); i!=temp.end(); ++i ) {
+ BSONElement e = *i;
+ if ( values.count( e ) )
+ continue;
+
+ int now = bb.len();
+
+ uassert(10044, "distinct too big, 16mb cap", ( now + e.size() + 1024 ) < bufSize );
+
+ arr.append( e );
+ BSONElement x( start + now );
+
+ values.insert( x );
+ }
+ }
+
+ if ( loadedObject || md._loadedObject )
+ nscannedObjects++;
+
+ cursor->advance();
+
+ if (!cc->yieldSometimes( ClientCursor::MaybeCovered )) {
+ cc.release();
+ break;
+ }
+
+ RARELY killCurrentOp.checkForInterrupt();
+ }
+
+ assert( start == bb.buf() );
+
+ result.appendArray( "values" , arr.done() );
+
+ {
+ BSONObjBuilder b;
+ b.appendNumber( "n" , n );
+ b.appendNumber( "nscanned" , nscanned );
+ b.appendNumber( "nscannedObjects" , nscannedObjects );
+ b.appendNumber( "timems" , t.millis() );
+ b.append( "cursor" , cursorName );
+ result.append( "stats" , b.obj() );
+ }
+
+ return true;
+ }
+
+ } distinctCmd;
+
+}
diff --git a/src/mongo/db/commands/document_source_cursor.cpp b/src/mongo/db/commands/document_source_cursor.cpp
new file mode 100755
index 00000000000..49bb9f19d9e
--- /dev/null
+++ b/src/mongo/db/commands/document_source_cursor.cpp
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/cursor.h"
+#include "db/pipeline/document.h"
+
+namespace mongo {
+
+ DocumentSourceCursor::~DocumentSourceCursor() {
+ }
+
+ bool DocumentSourceCursor::eof() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceCursor::advance() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ findNext();
+ return (pCurrent.get() != NULL);
+ }
+
+ intrusive_ptr<Document> DocumentSourceCursor::getCurrent() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ return pCurrent;
+ }
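+
+    /* typical consumption loop (sketch; assumes pSource came from create()):
+
+           while ( !pSource->eof() ) {
+               intrusive_ptr<Document> pDoc( pSource->getCurrent() );
+               // ... use pDoc ...
+               pSource->advance();
+           }
+    */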
+
+ void DocumentSourceCursor::findNext() {
+ /* standard cursor usage pattern */
+ while(pCursor->ok()) {
+ CoveredIndexMatcher *pCIM; // save intermediate result
+ if ((!(pCIM = pCursor->matcher()) ||
+ pCIM->matchesCurrent(pCursor.get())) &&
+ !pCursor->getsetdup(pCursor->currLoc())) {
+
+ /* grab the matching document */
+ BSONObj documentObj(pCursor->current());
+ pCurrent = Document::createFromBsonObj(&documentObj);
+ pCursor->advance();
+ return;
+ }
+
+ pCursor->advance();
+ }
+
+ /* if we got here, there aren't any more documents */
+ pCurrent.reset();
+ }
+
+ void DocumentSourceCursor::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ void DocumentSourceCursor::sourceToBson(BSONObjBuilder *pBuilder) const {
+ /* this has no analog in the BSON world */
+ assert(false);
+ }
+
+ DocumentSourceCursor::DocumentSourceCursor(
+ const shared_ptr<Cursor> &pTheCursor):
+ pCursor(pTheCursor),
+ pCurrent() {
+ }
+
+ intrusive_ptr<DocumentSourceCursor> DocumentSourceCursor::create(
+ const shared_ptr<Cursor> &pCursor) {
+ assert(pCursor.get());
+ intrusive_ptr<DocumentSourceCursor> pSource(
+ new DocumentSourceCursor(pCursor));
+ return pSource;
+ }
+}
diff --git a/src/mongo/db/commands/find_and_modify.cpp b/src/mongo/db/commands/find_and_modify.cpp
new file mode 100644
index 00000000000..0cf766fcf87
--- /dev/null
+++ b/src/mongo/db/commands/find_and_modify.cpp
@@ -0,0 +1,153 @@
+// find_and_modify.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+    /* Find and modify an object, returning either the old (default) or the new value */
+ class CmdFindAndModify : public Command {
+ public:
+ virtual void help( stringstream &help ) const {
+ help <<
+ "{ findAndModify: \"collection\", query: {processed:false}, update: {$set: {processed:true}}, new: true}\n"
+ "{ findAndModify: \"collection\", query: {processed:false}, remove: true, sort: {priority:-1}}\n"
+ "Either update or remove is required, all other fields have default values.\n"
+ "Output is in the \"value\" field\n";
+ }
+
+ CmdFindAndModify() : Command("findAndModify", false, "findandmodify") { }
+ virtual bool logTheOp() { return false; } // the modifications will be logged directly
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ static DBDirectClient db;
+
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+
+ BSONObj origQuery = cmdObj.getObjectField("query"); // defaults to {}
+ Query q (origQuery);
+ BSONElement sort = cmdObj["sort"];
+ if (!sort.eoo())
+ q.sort(sort.embeddedObjectUserCheck());
+
+ bool upsert = cmdObj["upsert"].trueValue();
+
+ BSONObj fieldsHolder (cmdObj.getObjectField("fields"));
+ const BSONObj* fields = (fieldsHolder.isEmpty() ? NULL : &fieldsHolder);
+
+ Projection projection;
+ if (fields) {
+ projection.init(fieldsHolder);
+ if (!projection.includeID())
+ fields = NULL; // do projection in post-processing
+ }
+
+ BSONObj out = db.findOne(ns, q, fields);
+ if (out.isEmpty()) {
+ if (!upsert) {
+ result.appendNull("value");
+ return true;
+ }
+
+ BSONElement update = cmdObj["update"];
+ uassert(13329, "upsert mode requires update field", !update.eoo());
+ uassert(13330, "upsert mode requires query field", !origQuery.isEmpty());
+ db.update(ns, origQuery, update.embeddedObjectUserCheck(), true);
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ if (cmdObj["new"].trueValue()) {
+ BSONElement _id = gle["upserted"];
+ if (_id.eoo())
+ _id = origQuery["_id"];
+
+ out = db.findOne(ns, QUERY("_id" << _id), fields);
+ }
+
+ }
+ else {
+
+ if (cmdObj["remove"].trueValue()) {
+ uassert(12515, "can't remove and update", cmdObj["update"].eoo());
+ db.remove(ns, QUERY("_id" << out["_id"]), 1);
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ }
+ else { // update
+
+ BSONElement queryId = origQuery["_id"];
+ if (queryId.eoo() || getGtLtOp(queryId) != BSONObj::Equality) {
+ // need to include original query for $ positional operator
+
+ BSONObjBuilder b;
+ b.append(out["_id"]);
+ BSONObjIterator it(origQuery);
+ while (it.more()) {
+ BSONElement e = it.next();
+ if (strcmp(e.fieldName(), "_id"))
+ b.append(e);
+ }
+ q = Query(b.obj());
+ }
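+
+                    // e.g. (hypothetical) query {tags:"x"} with update {$set:{"tags.$":"y"}}:
+                    // the rebuilt query is { _id : <matched _id> , tags : "x" }, so the
+                    // positional operator still has something to bind to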
+
+ if (q.isComplex()) // update doesn't work with complex queries
+ q = Query(q.getFilter().getOwned());
+
+ BSONElement update = cmdObj["update"];
+ uassert(12516, "must specify remove or update", !update.eoo());
+ db.update(ns, q, update.embeddedObjectUserCheck());
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ if (cmdObj["new"].trueValue())
+ out = db.findOne(ns, QUERY("_id" << out["_id"]), fields);
+ }
+ }
+
+ if (!fieldsHolder.isEmpty() && !fields){
+ // we need to run projection but haven't yet
+ out = projection.transform(out);
+ }
+
+ result.append("value", out);
+
+ return true;
+ }
+ } cmdFindAndModify;
+
+
+}
diff --git a/src/mongo/db/commands/group.cpp b/src/mongo/db/commands/group.cpp
new file mode 100644
index 00000000000..69fee587a47
--- /dev/null
+++ b/src/mongo/db/commands/group.cpp
@@ -0,0 +1,224 @@
+// group.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../queryoptimizer.h"
+#include "../../scripting/engine.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+ class GroupCommand : public Command {
+ public:
+ GroupCommand() : Command("group") {}
+ virtual LockType locktype() const { return READ; }
+ virtual bool slaveOk() const { return false; }
+ virtual bool slaveOverrideOk() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << "http://www.mongodb.org/display/DOCS/Aggregation";
+ }
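+
+        // example shell invocation (sketch):
+        //     db.runCommand( { group : { ns : "coll" , key : { a : 1 } , cond : {} ,
+        //                               $reduce : function(obj,prev){ prev.n++; } ,
+        //                               initial : { n : 0 } } } )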
+
+ BSONObj getKey( const BSONObj& obj , const BSONObj& keyPattern , ScriptingFunction func , double avgSize , Scope * s ) {
+ if ( func ) {
+ BSONObjBuilder b( obj.objsize() + 32 );
+ b.append( "0" , obj );
+ const BSONObj& key = b.obj();
+ int res = s->invoke( func , &key, 0 );
+ uassert( 10041 , (string)"invoke failed in $keyf: " + s->getError() , res == 0 );
+ int type = s->type("return");
+                uassert( 10042 , "return of $keyf has to be an object" , type == Object );
+ return s->getObject( "return" );
+ }
+ return obj.extractFields( keyPattern , true ).getOwned();
+ }
+
+ bool group( string realdbname , const string& ns , const BSONObj& query ,
+ BSONObj keyPattern , string keyFunctionCode , string reduceCode , const char * reduceScope ,
+ BSONObj initial , string finalize ,
+ string& errmsg , BSONObjBuilder& result ) {
+
+
+ auto_ptr<Scope> s = globalScriptEngine->getPooledScope( realdbname );
+ s->localConnect( realdbname.c_str() );
+
+ if ( reduceScope )
+ s->init( reduceScope );
+
+ s->setObject( "$initial" , initial , true );
+
+ s->exec( "$reduce = " + reduceCode , "reduce setup" , false , true , true , 100 );
+ s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
+ ScriptingFunction f = s->createFunction(
+ "function(){ "
+ " if ( $arr[n] == null ){ "
+ " next = {}; "
+ " Object.extend( next , $key ); "
+ " Object.extend( next , $initial , true ); "
+ " $arr[n] = next; "
+ " next = null; "
+ " } "
+ " $reduce( obj , $arr[n] ); "
+ "}" );
+
+ ScriptingFunction keyFunction = 0;
+ if ( keyFunctionCode.size() ) {
+ keyFunction = s->createFunction( keyFunctionCode.c_str() );
+ }
+
+
+ double keysize = keyPattern.objsize() * 3;
+ double keynum = 1;
+
+ map<BSONObj,int,BSONObjCmp> map;
+
+ shared_ptr<Cursor> cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query);
+ ClientCursor::CleanupPointer ccPointer;
+ ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) );
+
+ while ( cursor->ok() ) {
+
+ if ( !ccPointer->yieldSometimes( ClientCursor::MaybeCovered ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
+ if ( !cursor->currentMatches() || cursor->getsetdup( cursor->currLoc() ) ) {
+ cursor->advance();
+ continue;
+ }
+
+ if ( !ccPointer->yieldSometimes( ClientCursor::WillNeed ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
+ BSONObj obj = cursor->current();
+ cursor->advance();
+
+ BSONObj key = getKey( obj , keyPattern , keyFunction , keysize / keynum , s.get() );
+ keysize += key.objsize();
+ keynum++;
+
+ int& n = map[key];
+ if ( n == 0 ) {
+ n = map.size();
+ s->setObject( "$key" , key , true );
+
+ uassert( 10043 , "group() can't handle more than 20000 unique keys" , n <= 20000 );
+ }
+
+ s->setObject( "obj" , obj , true );
+ s->setNumber( "n" , n - 1 );
+ if ( s->invoke( f , 0, 0 , 0 , true ) ) {
+ throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() );
+ }
+ }
+ ccPointer.reset();
+
+ if (!finalize.empty()) {
+ s->exec( "$finalize = " + finalize , "finalize define" , false , true , true , 100 );
+ ScriptingFunction g = s->createFunction(
+ "function(){ "
+ " for(var i=0; i < $arr.length; i++){ "
+ " var ret = $finalize($arr[i]); "
+ " if (ret !== undefined) "
+ " $arr[i] = ret; "
+ " } "
+ "}" );
+ s->invoke( g , 0, 0 , 0 , true );
+ }
+
+ result.appendArray( "retval" , s->getObject( "$arr" ) );
+ result.append( "count" , keynum - 1 );
+ result.append( "keys" , (int)(map.size()) );
+ s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
+ s->gc();
+
+ return true;
+ }
+
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+
+ if ( !globalScriptEngine ) {
+ errmsg = "server-side JavaScript execution is disabled";
+ return false;
+ }
+
+ /* db.$cmd.findOne( { group : <p> } ) */
+ const BSONObj& p = jsobj.firstElement().embeddedObjectUserCheck();
+
+ BSONObj q;
+ if ( p["cond"].type() == Object )
+ q = p["cond"].embeddedObject();
+ else if ( p["condition"].type() == Object )
+ q = p["condition"].embeddedObject();
+ else
+ q = getQuery( p );
+
+ if ( p["ns"].type() != String ) {
+ errmsg = "ns has to be set";
+ return false;
+ }
+
+ string ns = dbname + "." + p["ns"].String();
+
+ BSONObj key;
+ string keyf;
+ if ( p["key"].type() == Object ) {
+ key = p["key"].embeddedObjectUserCheck();
+ if ( ! p["$keyf"].eoo() ) {
+ errmsg = "can't have key and $keyf";
+ return false;
+ }
+ }
+ else if ( p["$keyf"].type() ) {
+ keyf = p["$keyf"]._asCode();
+ }
+ else {
+ // no key specified, will use entire object as key
+ }
+
+ BSONElement reduce = p["$reduce"];
+ if ( reduce.eoo() ) {
+ errmsg = "$reduce has to be set";
+ return false;
+ }
+
+ BSONElement initial = p["initial"];
+ if ( initial.type() != Object ) {
+ errmsg = "initial has to be an object";
+ return false;
+ }
+
+
+ string finalize;
+ if (p["finalize"].type())
+ finalize = p["finalize"]._asCode();
+
+ return group( dbname , ns , q ,
+ key , keyf , reduce._asCode() , reduce.type() != CodeWScope ? 0 : reduce.codeWScopeScopeData() ,
+ initial.embeddedObject() , finalize ,
+ errmsg , result );
+ }
+
+ } cmdGroup;
+
+
+} // namespace mongo
diff --git a/src/mongo/db/commands/isself.cpp b/src/mongo/db/commands/isself.cpp
new file mode 100644
index 00000000000..ebf6d5bceec
--- /dev/null
+++ b/src/mongo/db/commands/isself.cpp
@@ -0,0 +1,246 @@
+// isself.cpp
+
+#include "pch.h"
+#include "../../util/net/listen.h"
+#include "../commands.h"
+#include "../../client/dbclient.h"
+#include "../security.h"
+
+#include <boost/algorithm/string.hpp>
+
+#ifndef _WIN32
+# ifndef __sunos__
+# include <ifaddrs.h>
+# endif
+# include <sys/resource.h>
+# include <sys/stat.h>
+
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netdb.h>
+#ifdef __openbsd__
+# include <sys/uio.h>
+#endif
+
+#endif
+
+
+namespace mongo {
+
+#if !defined(_WIN32) && !defined(__sunos__)
+
+ vector<string> getMyAddrs() {
+ vector<string> out;
+ ifaddrs * addrs;
+
+ if ( ! cmdLine.bind_ip.empty() ) {
+ boost::split( out, cmdLine.bind_ip, boost::is_any_of( ", " ) );
+ return out;
+ }
+
+ int status = getifaddrs(&addrs);
+ massert(13469, "getifaddrs failure: " + errnoWithDescription(errno), status == 0);
+
+ // based on example code from linux getifaddrs manpage
+ for (ifaddrs * addr = addrs; addr != NULL; addr = addr->ifa_next) {
+ if ( addr->ifa_addr == NULL ) continue;
+ int family = addr->ifa_addr->sa_family;
+ char host[NI_MAXHOST];
+
+ if (family == AF_INET || family == AF_INET6) {
+ status = getnameinfo(addr->ifa_addr,
+ (family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)),
+ host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
+ if ( status != 0 ) {
+ freeifaddrs( addrs );
+ addrs = NULL;
+ msgasserted( 13470, string("getnameinfo() failed: ") + gai_strerror(status) );
+ }
+
+ out.push_back(host);
+ }
+
+ }
+
+ freeifaddrs( addrs );
+ addrs = NULL;
+
+ if (logLevel >= 1) {
+ log(1) << "getMyAddrs():";
+ for (vector<string>::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) {
+ log(1) << " [" << *it << ']';
+ }
+ log(1) << endl;
+ }
+
+ return out;
+ }
+
+ vector<string> getAllIPs(StringData iporhost) {
+ addrinfo* addrs = NULL;
+ addrinfo hints;
+ memset(&hints, 0, sizeof(addrinfo));
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_family = (IPv6Enabled() ? AF_UNSPEC : AF_INET);
+
+ static string portNum = BSONObjBuilder::numStr(cmdLine.port);
+
+ vector<string> out;
+
+ int ret = getaddrinfo(iporhost.data(), portNum.c_str(), &hints, &addrs);
+ if ( ret ) {
+ warning() << "getaddrinfo(\"" << iporhost.data() << "\") failed: " << gai_strerror(ret) << endl;
+ return out;
+ }
+
+ for (addrinfo* addr = addrs; addr != NULL; addr = addr->ai_next) {
+ int family = addr->ai_family;
+ char host[NI_MAXHOST];
+
+ if (family == AF_INET || family == AF_INET6) {
+ int status = getnameinfo(addr->ai_addr, addr->ai_addrlen, host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
+
+ massert(13472, string("getnameinfo() failed: ") + gai_strerror(status), status == 0);
+
+ out.push_back(host);
+ }
+
+ }
+
+ freeaddrinfo(addrs);
+
+ if (logLevel >= 1) {
+            log(1) << "getAllIPs(\"" << iporhost << "\"):";
+ for (vector<string>::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) {
+ log(1) << " [" << *it << ']';
+ }
+ log(1) << endl;
+ }
+
+ return out;
+ }
+#endif
+
+
+ class IsSelfCommand : public Command {
+ public:
+ IsSelfCommand() : Command("_isSelf") , _cacheLock( "IsSelfCommand::_cacheLock" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "{ _isSelf : 1 } INTERNAL ONLY";
+ }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ init();
+ result.append( "id" , _id );
+ return true;
+ }
+
+ void init() {
+ scoped_lock lk( _cacheLock );
+ if ( ! _id.isSet() )
+ _id.init();
+ }
+
+ OID _id;
+
+ mongo::mutex _cacheLock;
+ map<string,bool> _cache;
+ } isSelfCommand;
+
+ bool HostAndPort::isSelf() const {
+
+ if( dyn() ) {
+ LOG(2) << "isSelf " << _dynName << ' ' << dynHostMyName() << endl;
+ return dynHostMyName() == _dynName;
+ }
+
+ int _p = port();
+ int p = _p == -1 ? CmdLine::DefaultDBPort : _p;
+
+ if( p != cmdLine.port ) {
+ // shortcut - ports have to match at the very least
+ return false;
+ }
+
+ string host = str::stream() << this->host() << ":" << p;
+
+ {
+ // check cache for this host
+ // debatably something _could_ change, but I'm not sure right now (erh 10/14/2010)
+ scoped_lock lk( isSelfCommand._cacheLock );
+ map<string,bool>::const_iterator i = isSelfCommand._cache.find( host );
+ if ( i != isSelfCommand._cache.end() )
+ return i->second;
+ }
+
+#if !defined(_WIN32) && !defined(__sunos__)
+ // on linux and os x we can do a quick check for an ip match
+
+ const vector<string> myaddrs = getMyAddrs();
+ const vector<string> addrs = getAllIPs(_host);
+
+ for (vector<string>::const_iterator i=myaddrs.begin(), iend=myaddrs.end(); i!=iend; ++i) {
+ for (vector<string>::const_iterator j=addrs.begin(), jend=addrs.end(); j!=jend; ++j) {
+ string a = *i;
+ string b = *j;
+
+ if ( a == b ||
+ ( str::startsWith( a , "127." ) && str::startsWith( b , "127." ) ) // 127. is all loopback
+ ) {
+
+ // add to cache
+ scoped_lock lk( isSelfCommand._cacheLock );
+ isSelfCommand._cache[host] = true;
+ return true;
+ }
+ }
+ }
+
+#endif
+
+ if ( ! Listener::getTimeTracker() ) {
+ // this ensures we are actually running a server
+ // this may return true later, so may want to retry
+ return false;
+ }
+
+ try {
+ isSelfCommand.init();
+ DBClientConnection conn;
+ string errmsg;
+ if ( ! conn.connect( host , errmsg ) ) {
+ // should this go in the cache?
+ return false;
+ }
+
+ if (!noauth && cmdLine.keyFile &&
+ !conn.auth("local", internalSecurity.user, internalSecurity.pwd, errmsg, false)) {
+ return false;
+ }
+
+ BSONObj out;
+ bool ok = conn.simpleCommand( "admin" , &out , "_isSelf" );
+ bool me = ok && out["id"].type() == jstOID && isSelfCommand._id == out["id"].OID();
+
+ // add to cache
+ scoped_lock lk( isSelfCommand._cacheLock );
+ isSelfCommand._cache[host] = me;
+
+ return me;
+ }
+ catch ( std::exception& e ) {
+            warning() << "couldn't check isSelf (" << host << ") " << e.what() << endl;
+ }
+
+ return false;
+ }
+
+}
diff --git a/src/mongo/db/commands/mr.cpp b/src/mongo/db/commands/mr.cpp
new file mode 100644
index 00000000000..add76c39c47
--- /dev/null
+++ b/src/mongo/db/commands/mr.cpp
@@ -0,0 +1,1317 @@
+// mr.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db.h"
+#include "../instance.h"
+#include "../commands.h"
+#include "../../scripting/engine.h"
+#include "../../client/dbclient.h"
+#include "../../client/connpool.h"
+#include "../../client/parallel.h"
+#include "../queryoptimizer.h"
+#include "../matcher.h"
+#include "../clientcursor.h"
+#include "../replutil.h"
+#include "../../s/d_chunk_manager.h"
+#include "../../s/d_logic.h"
+#include "../../s/grid.h"
+
+#include "mr.h"
+
+namespace mongo {
+
+ namespace mr {
+
+ AtomicUInt Config::JOB_NUMBER;
+
+ JSFunction::JSFunction( string type , const BSONElement& e ) {
+ _type = type;
+ _code = e._asCode();
+
+ if ( e.type() == CodeWScope )
+ _wantedScope = e.codeWScopeObject();
+ }
+
+ void JSFunction::init( State * state ) {
+ _scope = state->scope();
+ assert( _scope );
+ _scope->init( &_wantedScope );
+
+ _func = _scope->createFunction( _code.c_str() );
+ uassert( 13598 , str::stream() << "couldn't compile code for: " << _type , _func );
+
+ // install in JS scope so that it can be called in JS mode
+ _scope->setFunction(_type.c_str(), _code.c_str());
+ }
+
+ void JSMapper::init( State * state ) {
+ _func.init( state );
+ _params = state->config().mapParams;
+ }
+
+ /**
+ * Applies the map function to an object, which should internally call emit()
+ */
+ void JSMapper::map( const BSONObj& o ) {
+ Scope * s = _func.scope();
+ assert( s );
+ if ( s->invoke( _func.func() , &_params, &o , 0 , true, false, true ) )
+ throw UserException( 9014, str::stream() << "map invoke failed: " + s->getError() );
+ }
+
+ /**
+ * Applies the finalize function to a tuple obj (key, val)
+ * Returns tuple obj {_id: key, value: newval}
+ */
+ BSONObj JSFinalizer::finalize( const BSONObj& o ) {
+ Scope * s = _func.scope();
+
+ Scope::NoDBAccess no = s->disableDBAccess( "can't access db inside finalize" );
+ s->invokeSafe( _func.func() , &o, 0 );
+
+ // don't want to use o.objsize() to size b
+ // since there are many cases where the point of finalize
+ // is converting many fields to 1
+ BSONObjBuilder b;
+ b.append( o.firstElement() );
+ s->append( b , "value" , "return" );
+ return b.obj();
+ }
+
+ void JSReducer::init( State * state ) {
+ _func.init( state );
+ }
+
+ /**
+ * Reduces a list of tuple objects (key, value) to a single tuple {"0": key, "1": value}
+ */
+ BSONObj JSReducer::reduce( const BSONList& tuples ) {
+ if (tuples.size() <= 1)
+ return tuples[0];
+ BSONObj key;
+ int endSizeEstimate = 16;
+ _reduce( tuples , key , endSizeEstimate );
+
+ BSONObjBuilder b(endSizeEstimate);
+ b.appendAs( key.firstElement() , "0" );
+ _func.scope()->append( b , "1" , "return" );
+ return b.obj();
+ }
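+
+            // the user-supplied JS reduce must fold (key, [values]) into a single value,
+            // e.g. (sketch): function(key, values) { var t = 0;
+            //                    values.forEach(function(v){ t += v; }); return t; }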
+
+ /**
+         * Reduces a list of tuple objects (key, value) to a single tuple {_id: key, value: val}
+ * Also applies a finalizer method if present.
+ */
+ BSONObj JSReducer::finalReduce( const BSONList& tuples , Finalizer * finalizer ) {
+
+ BSONObj res;
+ BSONObj key;
+
+ if (tuples.size() == 1) {
+ // 1 obj, just use it
+ key = tuples[0];
+ BSONObjBuilder b(key.objsize());
+ BSONObjIterator it(key);
+ b.appendAs( it.next() , "_id" );
+ b.appendAs( it.next() , "value" );
+ res = b.obj();
+ }
+ else {
+ // need to reduce
+ int endSizeEstimate = 16;
+ _reduce( tuples , key , endSizeEstimate );
+ BSONObjBuilder b(endSizeEstimate);
+ b.appendAs( key.firstElement() , "_id" );
+ _func.scope()->append( b , "value" , "return" );
+ res = b.obj();
+ }
+
+ if ( finalizer ) {
+ res = finalizer->finalize( res );
+ }
+
+ return res;
+ }
+
+ /**
+         * Actually applies a reduce to a list of tuples (key, value).
+ * After the call, tuples will hold a single tuple {"0": key, "1": value}
+ */
+ void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) {
+ uassert( 10074 , "need values" , tuples.size() );
+
+ int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128;
+
+ // need to build the reduce args: ( key, [values] )
+ BSONObjBuilder reduceArgs( sizeEstimate );
+ boost::scoped_ptr<BSONArrayBuilder> valueBuilder;
+ int sizeSoFar = 0;
+ unsigned n = 0;
+ for ( ; n<tuples.size(); n++ ) {
+ BSONObjIterator j(tuples[n]);
+ BSONElement keyE = j.next();
+ if ( n == 0 ) {
+ reduceArgs.append( keyE );
+ key = keyE.wrap();
+ sizeSoFar = 5 + keyE.size();
+ valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) ));
+ }
+
+ BSONElement ee = j.next();
+
+ uassert( 13070 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) );
+
+ if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) {
+ assert( n > 1 ); // if not, inf. loop
+ break;
+ }
+
+ valueBuilder->append( ee );
+ sizeSoFar += ee.size();
+ }
+ assert(valueBuilder);
+ valueBuilder->done();
+ BSONObj args = reduceArgs.obj();
+
+ Scope * s = _func.scope();
+
+ s->invokeSafe( _func.func() , &args, 0, 0, false, true, true );
+ ++numReduces;
+
+ if ( s->type( "return" ) == Array ) {
+ uasserted( 10075 , "reduce -> multiple not supported yet");
+ return;
+ }
+
+ endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() );
+
+ if ( n == tuples.size() )
+ return;
+
+            // the input list was too large; add the remaining elements to new tuples and reduce again
+            // note: a loop would be better than recursion here, to avoid stack overflow
+ BSONList x;
+ for ( ; n < tuples.size(); n++ ) {
+ x.push_back( tuples[n] );
+ }
+ BSONObjBuilder temp( endSizeEstimate );
+ temp.append( key.firstElement() );
+ s->append( temp , "1" , "return" );
+ x.push_back( temp.obj() );
+ _reduce( x , key , endSizeEstimate );
+ }
+
+ Config::Config( const string& _dbname , const BSONObj& cmdObj ) {
+
+ dbname = _dbname;
+ ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ verbose = cmdObj["verbose"].trueValue();
+ jsMode = cmdObj["jsMode"].trueValue();
+ splitInfo = 0;
+ if (cmdObj.hasField("splitInfo"))
+ splitInfo = cmdObj["splitInfo"].Int();
+
+ jsMaxKeys = 500000;
+ reduceTriggerRatio = 10.0;
+ maxInMemSize = 500 * 1024;
+
+ uassert( 13602 , "outType is no longer a valid option" , cmdObj["outType"].eoo() );
+
+ if ( cmdObj["out"].type() == String ) {
+ finalShort = cmdObj["out"].String();
+ outType = REPLACE;
+ }
+ else if ( cmdObj["out"].type() == Object ) {
+ BSONObj o = cmdObj["out"].embeddedObject();
+
+ BSONElement e = o.firstElement();
+ string t = e.fieldName();
+
+ if ( t == "normal" || t == "replace" ) {
+ outType = REPLACE;
+ finalShort = e.String();
+ }
+ else if ( t == "merge" ) {
+ outType = MERGE;
+ finalShort = e.String();
+ }
+ else if ( t == "reduce" ) {
+ outType = REDUCE;
+ finalShort = e.String();
+ }
+ else if ( t == "inline" ) {
+ outType = INMEMORY;
+ }
+ else {
+ uasserted( 13522 , str::stream() << "unknown out specifier [" << t << "]" );
+ }
+
+ if (o.hasElement("db")) {
+ outDB = o["db"].String();
+ }
+
+ if (o.hasElement("nonAtomic")) {
+ outNonAtomic = o["nonAtomic"].Bool();
+ if (outNonAtomic)
+ uassert( 15895 , "nonAtomic option cannot be used with this output type", (outType == REDUCE || outType == MERGE) );
+ }
+ }
+ else {
+ uasserted( 13606 , "'out' has to be a string or an object" );
+ }
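+
+            // accepted 'out' specifiers, per the parsing above (sketch):
+            //     out : "coll"                                 -> REPLACE into <dbname>.coll
+            //     out : { merge : "coll" , db : "otherdb" }    -> MERGE into otherdb.coll
+            //     out : { reduce : "coll" , nonAtomic : true } -> REDUCE, re-reducing into coll
+            //     out : { inline : 1 }                         -> INMEMORY, results in the reply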
+
+ if ( outType != INMEMORY ) { // setup names
+ tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << JOB_NUMBER++;
+
+ incLong = tempLong + "_inc";
+
+ finalLong = str::stream() << (outDB.empty() ? dbname : outDB) << "." << finalShort;
+ }
+
+ {
+ // scope and code
+
+ if ( cmdObj["scope"].type() == Object )
+ scopeSetup = cmdObj["scope"].embeddedObjectUserCheck();
+
+ mapper.reset( new JSMapper( cmdObj["map"] ) );
+ reducer.reset( new JSReducer( cmdObj["reduce"] ) );
+ if ( cmdObj["finalize"].type() && cmdObj["finalize"].trueValue() )
+ finalizer.reset( new JSFinalizer( cmdObj["finalize"] ) );
+
+ if ( cmdObj["mapparams"].type() == Array ) {
+ mapParams = cmdObj["mapparams"].embeddedObjectUserCheck();
+ }
+
+ }
+
+ {
+ // query options
+ BSONElement q = cmdObj["query"];
+ if ( q.type() == Object )
+ filter = q.embeddedObjectUserCheck();
+ else
+ uassert( 13608 , "query has to be blank or an Object" , ! q.trueValue() );
+
+
+ BSONElement s = cmdObj["sort"];
+ if ( s.type() == Object )
+ sort = s.embeddedObjectUserCheck();
+ else
+ uassert( 13609 , "sort has to be blank or an Object" , ! s.trueValue() );
+
+ if ( cmdObj["limit"].isNumber() )
+ limit = cmdObj["limit"].numberLong();
+ else
+ limit = 0;
+ }
+ }
+
+ /**
+ * Create temporary collection, set up indexes
+ */
+ void State::prepTempCollection() {
+ if ( ! _onDisk )
+ return;
+
+ if (_config.incLong != _config.tempLong) {
+ // create the inc collection and make sure we have index on "0" key
+ _db.dropCollection( _config.incLong );
+ {
+ writelock l( _config.incLong );
+ Client::Context ctx( _config.incLong );
+ string err;
+ if ( ! userCreateNS( _config.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ) {
+ uasserted( 13631 , str::stream() << "userCreateNS failed for mr incLong ns: " << _config.incLong << " err: " << err );
+ }
+ }
+
+ BSONObj sortKey = BSON( "0" << 1 );
+ _db.ensureIndex( _config.incLong , sortKey );
+ }
+
+ // create temp collection
+ _db.dropCollection( _config.tempLong );
+ {
+ writelock lock( _config.tempLong.c_str() );
+ Client::Context ctx( _config.tempLong.c_str() );
+ string errmsg;
+ if ( ! userCreateNS( _config.tempLong.c_str() , BSONObj() , errmsg , true ) ) {
+ uasserted( 13630 , str::stream() << "userCreateNS failed for mr tempLong ns: " << _config.tempLong << " err: " << errmsg );
+ }
+ }
+
+ {
+ // copy indexes
+ auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.finalLong );
+ while ( idx->more() ) {
+ BSONObj i = idx->next();
+
+ BSONObjBuilder b( i.objsize() + 16 );
+ b.append( "ns" , _config.tempLong );
+ BSONObjIterator j( i );
+ while ( j.more() ) {
+ BSONElement e = j.next();
+ if ( str::equals( e.fieldName() , "_id" ) ||
+ str::equals( e.fieldName() , "ns" ) )
+ continue;
+
+ b.append( e );
+ }
+
+ BSONObj indexToInsert = b.obj();
+ insert( Namespace( _config.tempLong.c_str() ).getSisterNS( "system.indexes" ).c_str() , indexToInsert );
+ }
+
+ }
+
+ }
+
+ /**
+ * For inline mode, appends results to output object.
+ * Makes sure (key, value) tuple is formatted as {_id: key, value: val}
+ */
+ void State::appendResults( BSONObjBuilder& final ) {
+ if ( _onDisk ) {
+ if (!_config.outDB.empty()) {
+ BSONObjBuilder loc;
+ if ( !_config.outDB.empty())
+ loc.append( "db" , _config.outDB );
+ if ( !_config.finalShort.empty() )
+ loc.append( "collection" , _config.finalShort );
+ final.append("result", loc.obj());
+ }
+ else {
+ if ( !_config.finalShort.empty() )
+ final.append( "result" , _config.finalShort );
+ }
+
+ if ( _config.splitInfo > 0 ) {
+                // add split points, used for sharding
+ BSONObj res;
+ BSONObj idKey = BSON( "_id" << 1 );
+ if ( ! _db.runCommand( "admin" , BSON( "splitVector" << _config.finalLong << "keyPattern" << idKey << "maxChunkSizeBytes" << _config.splitInfo ) , res ) ) {
+ uasserted( 15921 , str::stream() << "splitVector failed: " << res );
+ }
+ if ( res.hasField( "splitKeys" ) )
+ final.append( res.getField( "splitKeys" ) );
+ }
+ return;
+ }
+
+ if (_jsMode) {
+ ScriptingFunction getResult = _scope->createFunction("var map = _mrMap; var result = []; for (key in map) { result.push({_id: key, value: map[key]}) } return result;");
+ _scope->invoke(getResult, 0, 0, 0, false);
+ BSONObj obj = _scope->getObject("return");
+ final.append("results", BSONArray(obj));
+ return;
+ }
+
+ uassert( 13604 , "too much data for in memory map/reduce" , _size < BSONObjMaxUserSize );
+
+ BSONArrayBuilder b( (int)(_size * 1.2) ); // _size is data size, doesn't count overhead and keys
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ assert( all.size() == 1 );
+
+ BSONObjIterator vi( all[0] );
+ vi.next();
+
+ BSONObjBuilder temp( b.subobjStart() );
+ temp.appendAs( key.firstElement() , "_id" );
+ temp.appendAs( vi.next() , "value" );
+ temp.done();
+ }
+
+ BSONArray res = b.arr();
+ final.append( "results" , res );
+ }
+
+ /**
+ * Does post processing on output collection.
+ * This may involve replacing, merging or reducing.
+ */
+ long long State::postProcessCollection(CurOp* op, ProgressMeterHolder& pm) {
+ if ( _onDisk == false || _config.outType == Config::INMEMORY )
+ return _temp->size();
+
+ if (_config.outNonAtomic)
+ return postProcessCollectionNonAtomic(op, pm);
+ writelock lock;
+ return postProcessCollectionNonAtomic(op, pm);
+ }
+
+ long long State::postProcessCollectionNonAtomic(CurOp* op, ProgressMeterHolder& pm) {
+
+ if ( _config.finalLong == _config.tempLong )
+ return _db.count( _config.finalLong );
+
+ if ( _config.outType == Config::REPLACE || _db.count( _config.finalLong ) == 0 ) {
+ writelock lock;
+ // replace: just rename from temp to final collection name, dropping previous collection
+ _db.dropCollection( _config.finalLong );
+ BSONObj info;
+ if ( ! _db.runCommand( "admin" , BSON( "renameCollection" << _config.tempLong << "to" << _config.finalLong ) , info ) ) {
+ uasserted( 10076 , str::stream() << "rename failed: " << info );
+ }
+
+ _db.dropCollection( _config.tempLong );
+ }
+ else if ( _config.outType == Config::MERGE ) {
+ // merge: upsert new docs into old collection
+ op->setMessage( "m/r: merge post processing" , _db.count( _config.tempLong, BSONObj() ) );
+ auto_ptr<DBClientCursor> cursor = _db.query( _config.tempLong , BSONObj() );
+ while ( cursor->more() ) {
+ writelock lock;
+ BSONObj o = cursor->next();
+ Helpers::upsert( _config.finalLong , o );
+ getDur().commitIfNeeded();
+ pm.hit();
+ }
+ _db.dropCollection( _config.tempLong );
+ pm.finished();
+ }
+ else if ( _config.outType == Config::REDUCE ) {
+ // reduce: apply reduce op on new result and existing one
+ BSONList values;
+
+ op->setMessage( "m/r: reduce post processing" , _db.count( _config.tempLong, BSONObj() ) );
+ auto_ptr<DBClientCursor> cursor = _db.query( _config.tempLong , BSONObj() );
+ while ( cursor->more() ) {
+ writelock lock;
+ BSONObj temp = cursor->next();
+ BSONObj old;
+
+ bool found;
+ {
+ Client::Context tx( _config.finalLong );
+ found = Helpers::findOne( _config.finalLong.c_str() , temp["_id"].wrap() , old , true );
+ }
+
+ if ( found ) {
+ // need to reduce
+ values.clear();
+ values.push_back( temp );
+ values.push_back( old );
+ Helpers::upsert( _config.finalLong , _config.reducer->finalReduce( values , _config.finalizer.get() ) );
+ }
+ else {
+ Helpers::upsert( _config.finalLong , temp );
+ }
+ getDur().commitIfNeeded();
+ pm.hit();
+ }
+ _db.dropCollection( _config.tempLong );
+ pm.finished();
+ }
+
+ return _db.count( _config.finalLong );
+ }
+
+ /**
+ * Insert doc in collection
+ */
+ void State::insert( const string& ns , const BSONObj& o ) {
+ assert( _onDisk );
+
+ writelock l( ns );
+ Client::Context ctx( ns );
+
+ theDataFileMgr.insertAndLog( ns.c_str() , o , false );
+ }
+
+ /**
+ * Insert doc into the inc collection, taking proper lock
+ */
+ void State::insertToInc( BSONObj& o ) {
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong);
+ _insertToInc(o);
+ }
+
+ /**
+ * Insert doc into the inc collection
+ */
+ void State::_insertToInc( BSONObj& o ) {
+ assert( _onDisk );
+ theDataFileMgr.insertWithObjMod( _config.incLong.c_str() , o , true );
+ getDur().commitIfNeeded();
+ }
+
+ State::State( const Config& c ) : _config( c ), _size(0), _dupCount(0), _numEmits(0) {
+ _temp.reset( new InMemory() );
+ _onDisk = _config.outType != Config::INMEMORY;
+ }
+
+ bool State::sourceExists() {
+ return _db.exists( _config.ns );
+ }
+
+ long long State::incomingDocuments() {
+ return _db.count( _config.ns , _config.filter , QueryOption_SlaveOk , (unsigned) _config.limit );
+ }
+
+ State::~State() {
+ if ( _onDisk ) {
+ try {
+ _db.dropCollection( _config.tempLong );
+ _db.dropCollection( _config.incLong );
+ }
+ catch ( std::exception& e ) {
+ error() << "couldn't cleanup after map reduce: " << e.what() << endl;
+ }
+ }
+
+ if (_scope) {
+ // cleanup js objects
+ ScriptingFunction cleanup = _scope->createFunction("delete _emitCt; delete _keyCt; delete _mrMap;");
+ _scope->invoke(cleanup, 0, 0, 0, true);
+ }
+ }
+
+ /**
+ * Initialize the mapreduce operation, creating the inc collection
+ */
+ void State::init() {
+ // setup js
+ _scope.reset(globalScriptEngine->getPooledScope( _config.dbname ).release() );
+ _scope->localConnect( _config.dbname.c_str() );
+
+ if ( ! _config.scopeSetup.isEmpty() )
+ _scope->init( &_config.scopeSetup );
+
+ _config.mapper->init( this );
+ _config.reducer->init( this );
+ if ( _config.finalizer )
+ _config.finalizer->init( this );
+ _scope->setBoolean("_doFinal", _config.finalizer);
+
+ // by default start in JS mode, will be faster for small jobs
+ _jsMode = _config.jsMode;
+// _jsMode = true;
+ switchMode(_jsMode);
+
+ // global JS map/reduce hashmap
+ // we use a standard JS object which means keys are only simple types
+            // we could also use a real hashmap from a library, though we would need to add object comparison methods
+// _scope->setObject("_mrMap", BSONObj(), false);
+ ScriptingFunction init = _scope->createFunction("_emitCt = 0; _keyCt = 0; _dupCt = 0; _redCt = 0; if (typeof(_mrMap) === 'undefined') { _mrMap = {}; }");
+ _scope->invoke(init, 0, 0, 0, true);
+
+ // js function to run reduce on all keys
+// redfunc = _scope->createFunction("for (var key in hashmap) { print('Key is ' + key); list = hashmap[key]; ret = reduce(key, list); print('Value is ' + ret); };");
+ _reduceAll = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length != 1) { ret = _reduce(key, list); map[key] = [ret]; ++_redCt; } } _dupCt = 0;");
+ _reduceAndEmit = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; } emit(key, ret); }; delete _mrMap;");
+ _reduceAndFinalize = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { if (!_doFinal) {continue;} ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; }; if (_doFinal){ ret = _finalize(key, ret); } map[key] = ret; }");
+ _reduceAndFinalizeAndInsert = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; }; if (_doFinal){ ret = _finalize(key, ret); } _nativeToTemp({_id: key, value: ret}); }");
+
+ }
+
+ void State::switchMode(bool jsMode) {
+ _jsMode = jsMode;
+ if (jsMode) {
+ // emit function that stays in JS
+ _scope->setFunction("emit", "function(key, value) { if (typeof(key) === 'object') { _bailFromJS(key, value); return; }; ++_emitCt; var map = _mrMap; var list = map[key]; if (!list) { ++_keyCt; list = []; map[key] = list; } else { ++_dupCt; } list.push(value); }");
+ _scope->injectNative("_bailFromJS", _bailFromJS, this);
+ }
+ else {
+ // emit now populates C++ map
+ _scope->injectNative( "emit" , fast_emit, this );
+ }
+ }
+
+ void State::bailFromJS() {
+ log(1) << "M/R: Switching from JS mode to mixed mode" << endl;
+
+ // reduce and reemit into c++
+ switchMode(false);
+ _scope->invoke(_reduceAndEmit, 0, 0, 0, true);
+ // need to get the real number emitted so far
+ _numEmits = _scope->getNumberInt("_emitCt");
+ _config.reducer->numReduces = _scope->getNumberInt("_redCt");
+ }
+
+ /**
+ * Applies last reduce and finalize on a list of tuples (key, val)
+ * Inserts single result {_id: key, value: val} into temp collection
+ */
+ void State::finalReduce( BSONList& values ) {
+ if ( !_onDisk || values.size() == 0 )
+ return;
+
+ BSONObj res = _config.reducer->finalReduce( values , _config.finalizer.get() );
+ insert( _config.tempLong , res );
+ }
+
+ BSONObj _nativeToTemp( const BSONObj& args, void* data ) {
+ State* state = (State*) data;
+ BSONObjIterator it(args);
+ state->insert(state->_config.tempLong, it.next().Obj());
+ return BSONObj();
+ }
+
+// BSONObj _nativeToInc( const BSONObj& args, void* data ) {
+// State* state = (State*) data;
+// BSONObjIterator it(args);
+// const BSONObj& obj = it.next().Obj();
+// state->_insertToInc(const_cast<BSONObj&>(obj));
+// return BSONObj();
+// }
+
+ /**
+ * Applies last reduce and finalize.
+ * After calling this method, the temp collection will be completed.
+ * If inline, the results will be in the in memory map
+ */
+ void State::finalReduce( CurOp * op , ProgressMeterHolder& pm ) {
+
+ if (_jsMode) {
+ // apply the reduce within JS
+ if (_onDisk) {
+ _scope->injectNative("_nativeToTemp", _nativeToTemp, this);
+ _scope->invoke(_reduceAndFinalizeAndInsert, 0, 0, 0, true);
+ return;
+ }
+ else {
+ _scope->invoke(_reduceAndFinalize, 0, 0, 0, true);
+ return;
+ }
+ }
+
+ if ( ! _onDisk ) {
+ // all data has already been reduced, just finalize
+ if ( _config.finalizer ) {
+ long size = 0;
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ assert( all.size() == 1 );
+
+ BSONObj res = _config.finalizer->finalize( all[0] );
+
+ all.clear();
+ all.push_back( res );
+ size += res.objsize();
+ }
+ _size = size;
+ }
+ return;
+ }
+
+ // use index on "0" to pull sorted data
+ assert( _temp->size() == 0 );
+ BSONObj sortKey = BSON( "0" << 1 );
+ {
+ bool foundIndex = false;
+
+ auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.incLong );
+ while ( idx.get() && idx->more() ) {
+ BSONObj x = idx->next();
+ if ( sortKey.woCompare( x["key"].embeddedObject() ) == 0 ) {
+ foundIndex = true;
+ break;
+ }
+ }
+
+ assert( foundIndex );
+ }
+
+ readlock rl( _config.incLong.c_str() );
+ Client::Context ctx( _config.incLong );
+
+ BSONObj prev;
+ BSONList all;
+
+ assert( pm == op->setMessage( "m/r: (3/3) final reduce to collection" , _db.count( _config.incLong, BSONObj(), QueryOption_SlaveOk ) ) );
+
+ shared_ptr<Cursor> temp = bestGuessCursor( _config.incLong.c_str() , BSONObj() , sortKey );
+ auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , _config.incLong.c_str() ) );
+
+ // iterate over all sorted objects
+ while ( cursor->ok() ) {
+ BSONObj o = cursor->current().getOwned();
+ cursor->advance();
+
+ pm.hit();
+
+ if ( o.woSortOrder( prev , sortKey ) == 0 ) {
+ // object is same as previous, add to array
+ all.push_back( o );
+ if ( pm->hits() % 1000 == 0 ) {
+ if ( ! cursor->yield() ) {
+ cursor.release();
+ break;
+ }
+ killCurrentOp.checkForInterrupt();
+ }
+ continue;
+ }
+
+ ClientCursor::YieldLock yield (cursor.get());
+
+ try {
+                    // reduce and finalize the accumulated array
+ finalReduce( all );
+ }
+ catch (...) {
+ yield.relock();
+ cursor.release();
+ throw;
+ }
+
+ all.clear();
+ prev = o;
+ all.push_back( o );
+
+ if ( ! yield.stillOk() ) {
+ cursor.release();
+ break;
+ }
+
+ killCurrentOp.checkForInterrupt();
+ }
+
+ // we need to release here since we temp release below
+ cursor.release();
+
+ {
+ dbtempreleasecond tl;
+ if ( ! tl.unlocked() )
+ log( LL_WARNING ) << "map/reduce can't temp release" << endl;
+ // reduce and finalize last array
+ finalReduce( all );
+ }
+
+ pm.finished();
+ }
+
+ /**
+ * Attempts to reduce objects in the memory map.
+ * A new memory map will be created to hold the results.
+         * If applicable, objects with a unique key may be dumped to the inc collection.
+ * Input and output objects are both {"0": key, "1": val}
+ */
+ void State::reduceInMemory() {
+
+ if (_jsMode) {
+ // in js mode the reduce is applied when writing to collection
+ return;
+ }
+
+ auto_ptr<InMemory> n( new InMemory() ); // for new data
+ long nSize = 0;
+ _dupCount = 0;
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ if ( all.size() == 1 ) {
+ // only 1 value for this key
+ if ( _onDisk ) {
+ // this key has low cardinality, so just write to collection
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong.c_str());
+ _insertToInc( *(all.begin()) );
+ }
+ else {
+ // add to new map
+ _add( n.get() , all[0] , nSize );
+ }
+ }
+ else if ( all.size() > 1 ) {
+ // several values, reduce and add to map
+ BSONObj res = _config.reducer->reduce( all );
+ _add( n.get() , res , nSize );
+ }
+ }
+
+ // swap maps
+ _temp.reset( n.release() );
+ _size = nSize;
+ }
+
+ /**
+ * Dumps the entire in memory map to the inc collection.
+ */
+ void State::dumpToInc() {
+ if ( ! _onDisk )
+ return;
+
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong);
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ) {
+ BSONList& all = i->second;
+ if ( all.size() < 1 )
+ continue;
+
+ for ( BSONList::iterator j=all.begin(); j!=all.end(); j++ )
+ _insertToInc( *j );
+ }
+ _temp->clear();
+ _size = 0;
+
+ }
+
+ /**
+ * Adds object to in memory map
+ */
+ void State::emit( const BSONObj& a ) {
+ _numEmits++;
+ _add( _temp.get() , a , _size );
+ }
+
+ void State::_add( InMemory* im, const BSONObj& a , long& size ) {
+ BSONList& all = (*im)[a];
+ all.push_back( a );
+ size += a.objsize() + 16;
+ if (all.size() > 1)
+ ++_dupCount;
+ }
+
+ /**
+         * this method checks the size of the in-memory map and potentially flushes it to disk
+ */
+ void State::checkSize() {
+ if (_jsMode) {
+ // try to reduce if it is beneficial
+ int dupCt = _scope->getNumberInt("_dupCt");
+ int keyCt = _scope->getNumberInt("_keyCt");
+
+ if (keyCt > _config.jsMaxKeys) {
+ // too many keys for JS, switch to mixed
+ _bailFromJS(BSONObj(), this);
+ // then fall through to check map size
+ }
+ else if (dupCt > (keyCt * _config.reduceTriggerRatio)) {
+ // reduce now to lower mem usage
+ Timer t;
+ _scope->invoke(_reduceAll, 0, 0, 0, true);
+ log(1) << " MR - did reduceAll: keys=" << keyCt << " dups=" << dupCt << " newKeys=" << _scope->getNumberInt("_keyCt") << " time=" << t.millis() << "ms" << endl;
+ return;
+ }
+ }
+
+ if (_jsMode)
+ return;
+
+ if (_size > _config.maxInMemSize || _dupCount > (_temp->size() * _config.reduceTriggerRatio)) {
+ // attempt to reduce in memory map, if memory is too high or we have many duplicates
+ long oldSize = _size;
+ Timer t;
+ reduceInMemory();
+ log(1) << " MR - did reduceInMemory: size=" << oldSize << " dups=" << _dupCount << " newSize=" << _size << " time=" << t.millis() << "ms" << endl;
+
+ // if size is still high, or values are not reducing well, dump
+ if ( _onDisk && (_size > _config.maxInMemSize || _size > oldSize / 2) ) {
+ dumpToInc();
+ log(1) << " MR - dumping to db" << endl;
+ }
+ }
+ }
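+
+        // e.g. with the Config defaults above (maxInMemSize = 500KB, reduceTriggerRatio = 10),
+        // a non-jsMode job reduces in memory once the map tops 500KB or duplicates outnumber
+        // keys 10:1, and dumps to the inc collection if that reduce fails to roughly halve the map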
+
+ /**
+ * emit that will be called by js function
+ */
+ BSONObj fast_emit( const BSONObj& args, void* data ) {
+ uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 );
+ uassert( 13069 , "an emit can't be more than half max bson size" , args.objsize() < ( BSONObjMaxUserSize / 2 ) );
+
+ State* state = (State*) data;
+ if ( args.firstElement().type() == Undefined ) {
+ BSONObjBuilder b( args.objsize() );
+ b.appendNull( "" );
+ BSONObjIterator i( args );
+ i.next();
+ b.append( i.next() );
+ state->emit( b.obj() );
+ }
+ else {
+ state->emit( args );
+ }
+ return BSONObj();
+ }
+
+ /**
+         * called when we realize we can't use js mode for m/r on the 1st key
+ */
+ BSONObj _bailFromJS( const BSONObj& args, void* data ) {
+ State* state = (State*) data;
+ state->bailFromJS();
+
+ // emit this particular key if there is one
+ if (!args.isEmpty()) {
+ fast_emit(args, data);
+ }
+ return BSONObj();
+ }
+
+ /**
+ * This class represents a map/reduce command executed on a single server
+ */
+ class MapReduceCommand : public Command {
+ public:
+ MapReduceCommand() : Command("mapReduce", false, "mapreduce") {}
+
+ /* why !replset ?
+ bad things happen with --slave (i think because of this)
+ */
+ virtual bool slaveOk() const { return !replSet; }
+
+ virtual bool slaveOverrideOk() { return true; }
+
+ virtual void help( stringstream &help ) const {
+ help << "Run a map/reduce operation on the server.\n";
+ help << "Note this is used for aggregation, not querying, in MongoDB.\n";
+ help << "http://www.mongodb.org/display/DOCS/MapReduce";
+ }
+
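+
+        // example invocation (sketch; m and r are user-supplied JS functions):
+        //     db.runCommand( { mapreduce : "coll" , map : m , reduce : r , out : { inline : 1 } } )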
+ virtual LockType locktype() const { return NONE; }
+
+ bool run(const string& dbname , BSONObj& cmd, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer t;
+ Client& client = cc();
+ CurOp * op = client.curop();
+
+ Config config( dbname , cmd );
+
+ log(1) << "mr ns: " << config.ns << endl;
+
+ bool shouldHaveData = false;
+
+ long long num = 0;
+ long long inReduce = 0;
+
+ BSONObjBuilder countsBuilder;
+ BSONObjBuilder timingBuilder;
+ State state( config );
+ if ( ! state.sourceExists() ) {
+ errmsg = "ns doesn't exist";
+ return false;
+ }
+
+ if (replSet && state.isOnDisk()) {
+                // an on-disk output means this will be doing write operations, so make sure we are master
+ // ideally this check should be in slaveOk(), but at that point config is not known
+ if (!isMaster(dbname.c_str())) {
+ errmsg = "not master";
+ return false;
+ }
+ }
+
+ if (state.isOnDisk() && !client.getAuthenticationInfo()->isAuthorized(dbname)) {
+ errmsg = "read-only user cannot output mapReduce to collection, use inline instead";
+ return false;
+ }
+
+ try {
+ state.init();
+ state.prepTempCollection();
+ ProgressMeterHolder pm( op->setMessage( "m/r: (1/3) emit phase" , state.incomingDocuments() ) );
+
+                wassert( config.limit < 0x4000000 ); // sanity: keep limit well below 32 bit unsigned range
+ long long mapTime = 0;
+ {
+ readlock lock( config.ns );
+ Client::Context ctx( config.ns );
+
+ ShardChunkManagerPtr chunkManager;
+ if ( shardingState.needShardChunkManager( config.ns ) ) {
+ chunkManager = shardingState.getShardChunkManager( config.ns );
+ }
+
+ // obtain cursor on data to apply mr to, sorted
+ shared_ptr<Cursor> temp = NamespaceDetailsTransient::getCursor( config.ns.c_str(), config.filter, config.sort );
+ uassert( 15876, str::stream() << "could not create cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, temp.get() );
+ auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , config.ns.c_str() ) );
+ uassert( 15877, str::stream() << "could not create client cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, cursor.get() );
+
+ Timer mt;
+ // go through each doc
+ while ( cursor->ok() ) {
+ if ( ! cursor->currentMatches() ) {
+ cursor->advance();
+ continue;
+ }
+
+                    // make sure we don't process duplicates in case data gets moved around during map
+ // TODO This won't actually help when data gets moved, it's to handle multikeys.
+ if ( cursor->currentIsDup() ) {
+ cursor->advance();
+ continue;
+ }
+
+ BSONObj o = cursor->current();
+ cursor->advance();
+
+ // check to see if this is a new object we don't own yet
+ // because of a chunk migration
+ if ( chunkManager && ! chunkManager->belongsToMe( o ) )
+ continue;
+
+ // do map
+ if ( config.verbose ) mt.reset();
+ config.mapper->map( o );
+ if ( config.verbose ) mapTime += mt.micros();
+
+ num++;
+ if ( num % 1000 == 0 ) {
+ // try to yield lock regularly
+ ClientCursor::YieldLock yield (cursor.get());
+ Timer t;
+ // check if map needs to be dumped to disk
+ state.checkSize();
+ inReduce += t.micros();
+
+ if ( ! yield.stillOk() ) {
+ cursor.release();
+ break;
+ }
+
+ killCurrentOp.checkForInterrupt();
+ }
+ pm.hit();
+
+ if ( config.limit && num >= config.limit )
+ break;
+ }
+ }
+ pm.finished();
+
+ killCurrentOp.checkForInterrupt();
+ // update counters
+ countsBuilder.appendNumber( "input" , num );
+ countsBuilder.appendNumber( "emit" , state.numEmits() );
+ if ( state.numEmits() )
+ shouldHaveData = true;
+
+ timingBuilder.append( "mapTime" , mapTime / 1000 );
+ timingBuilder.append( "emitLoop" , t.millis() );
+
+ op->setMessage( "m/r: (2/3) final reduce in memory" );
+                Timer rt; // reduce-phase timer; must not shadow the outer total timer t
+ // do reduce in memory
+ // this will be the last reduce needed for inline mode
+ state.reduceInMemory();
+ // if not inline: dump the in memory map to inc collection, all data is on disk
+ state.dumpToInc();
+ // final reduce
+ state.finalReduce( op , pm );
+                inReduce += rt.micros();
+ countsBuilder.appendNumber( "reduce" , state.numReduces() );
+ timingBuilder.append( "reduceTime" , inReduce / 1000 );
+ timingBuilder.append( "mode" , state.jsMode() ? "js" : "mixed" );
+
+ long long finalCount = state.postProcessCollection(op, pm);
+ state.appendResults( result );
+
+ timingBuilder.append( "total" , t.millis() );
+ result.append( "timeMillis" , t.millis() );
+ countsBuilder.appendNumber( "output" , finalCount );
+ if ( config.verbose ) result.append( "timing" , timingBuilder.obj() );
+ result.append( "counts" , countsBuilder.obj() );
+
+ if ( finalCount == 0 && shouldHaveData ) {
+ result.append( "cmd" , cmd );
+ errmsg = "there were emits but no data!";
+ return false;
+ }
+
+ }
+ catch( SendStaleConfigException& e ){
+ log() << "mr detected stale config, should retry" << causedBy(e) << endl;
+ throw e;
+ }
+ // TODO: The error handling code for queries is v. fragile,
+            // *requires* rethrowing AssertionExceptions - should probably fix.
+ catch ( AssertionException& e ){
+ log() << "mr failed, removing collection" << causedBy(e) << endl;
+ throw e;
+ }
+ catch ( std::exception& e ){
+ log() << "mr failed, removing collection" << causedBy(e) << endl;
+ throw e;
+ }
+ catch ( ... ) {
+ log() << "mr failed for unknown reason, removing collection" << endl;
+ throw;
+ }
+
+ return true;
+ }
+
+ } mapReduceCommand;
+
+ /**
+     * This class represents a map/reduce command executed on the output server of a sharded environment
+ */
+ class MapReduceFinishCommand : public Command {
+ public:
+ MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ) {}
+ virtual bool slaveOk() const { return !replSet; }
+ virtual bool slaveOverrideOk() { return true; }
+
+ virtual LockType locktype() const { return NONE; }
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ ShardedConnectionInfo::addHook();
+ // legacy name
+ string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe();
+ string inputNS = cmdObj["inputNS"].valuestrsafe();
+ if (inputNS.empty())
+ inputNS = dbname + "." + shardedOutputCollection;
+
+ Client& client = cc();
+ CurOp * op = client.curop();
+
+ Config config( dbname , cmdObj.firstElement().embeddedObjectUserCheck() );
+ State state(config);
+ state.init();
+
+ // no need for incremental collection because records are already sorted
+ config.incLong = config.tempLong;
+
+ BSONObj shardCounts = cmdObj["shardCounts"].embeddedObjectUserCheck();
+ BSONObj counts = cmdObj["counts"].embeddedObjectUserCheck();
+
+ ProgressMeterHolder pm( op->setMessage( "m/r: merge sort and reduce" ) );
+ set<ServerAndQuery> servers;
+ vector< auto_ptr<DBClientCursor> > shardCursors;
+
+ {
+ // parse per shard results
+ BSONObjIterator i( shardCounts );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string shard = e.fieldName();
+// BSONObj res = e.embeddedObjectUserCheck();
+ servers.insert( shard );
+ }
+ }
+
+ state.prepTempCollection();
+
+ BSONList values;
+ if (!config.outDB.empty()) {
+ BSONObjBuilder loc;
+ if ( !config.outDB.empty())
+ loc.append( "db" , config.outDB );
+ if ( !config.finalShort.empty() )
+ loc.append( "collection" , config.finalShort );
+ result.append("result", loc.obj());
+ }
+ else {
+ if ( !config.finalShort.empty() )
+ result.append( "result" , config.finalShort );
+ }
+
+            // fetch results from the other shards one chunk at a time
+ // it would be better to do just one big $or query, but then the sorting would not be efficient
+ string shardName = shardingState.getShardName();
+ DBConfigPtr confOut = grid.getDBConfig( dbname , false );
+ vector<ChunkPtr> chunks;
+ if ( confOut->isSharded(config.finalLong) ) {
+ ChunkManagerPtr cm = confOut->getChunkManager( config.finalLong );
+ const ChunkMap& chunkMap = cm->getChunkMap();
+ for ( ChunkMap::const_iterator it = chunkMap.begin(); it != chunkMap.end(); ++it ) {
+ ChunkPtr chunk = it->second;
+ if (chunk->getShard().getName() == shardName) chunks.push_back(chunk);
+ }
+ }
+
+ long long inputCount = 0;
+ unsigned int index = 0;
+ BSONObj query;
+ BSONArrayBuilder chunkSizes;
+ while (true) {
+ ChunkPtr chunk;
+ if (chunks.size() > 0) {
+ chunk = chunks[index];
+ BSONObjBuilder b;
+ b.appendAs(chunk->getMin().firstElement(), "$gte");
+ b.appendAs(chunk->getMax().firstElement(), "$lt");
+ query = BSON("_id" << b.obj());
+// chunkSizes.append(min);
+ }
+
+ // reduce from each shard for a chunk
+ BSONObj sortKey = BSON( "_id" << 1 );
+ ParallelSortClusteredCursor cursor( servers , inputNS , Query( query ).sort( sortKey ) );
+ cursor.init();
+ int chunkSize = 0;
+
+ while ( cursor.more() || !values.empty() ) {
+ BSONObj t;
+ if (cursor.more()) {
+ t = cursor.next().getOwned();
+ ++inputCount;
+
+ if ( values.size() == 0 ) {
+ values.push_back( t );
+ continue;
+ }
+
+ if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ) {
+ values.push_back( t );
+ continue;
+ }
+ }
+
+ BSONObj res = config.reducer->finalReduce( values , config.finalizer.get());
+ chunkSize += res.objsize();
+ if (state.isOnDisk())
+ state.insertToInc(res);
+ else
+ state.emit(res);
+ values.clear();
+ if (!t.isEmpty())
+ values.push_back( t );
+ }
+
+ if (chunk) {
+ chunkSizes.append(chunk->getMin());
+ chunkSizes.append(chunkSize);
+ }
+ if (++index >= chunks.size())
+ break;
+ }
+
+ result.append( "chunkSizes" , chunkSizes.arr() );
+
+ long long outputCount = state.postProcessCollection(op, pm);
+ state.appendResults( result );
+
+ BSONObjBuilder countsB(32);
+ countsB.append("input", inputCount);
+ countsB.append("reduce", state.numReduces());
+ countsB.append("output", outputCount);
+ result.append( "counts" , countsB.obj() );
+
+            return true;
+ }
+ } mapReduceFinishCommand;
+
+ }
+
+}
+
diff --git a/src/mongo/db/commands/mr.h b/src/mongo/db/commands/mr.h
new file mode 100644
index 00000000000..592769d82da
--- /dev/null
+++ b/src/mongo/db/commands/mr.h
@@ -0,0 +1,319 @@
+// mr.h
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ namespace mr {
+
+ typedef vector<BSONObj> BSONList;
+
+ class State;
+
+ // ------------ function interfaces -----------
+
+ class Mapper : boost::noncopyable {
+ public:
+ virtual ~Mapper() {}
+ virtual void init( State * state ) = 0;
+
+ virtual void map( const BSONObj& o ) = 0;
+ };
+
+ class Finalizer : boost::noncopyable {
+ public:
+ virtual ~Finalizer() {}
+ virtual void init( State * state ) = 0;
+
+ /**
+ * this takes a tuple and returns a tuple
+ */
+ virtual BSONObj finalize( const BSONObj& tuple ) = 0;
+ };
+
+ class Reducer : boost::noncopyable {
+ public:
+ Reducer() : numReduces(0) {}
+ virtual ~Reducer() {}
+ virtual void init( State * state ) = 0;
+
+ virtual BSONObj reduce( const BSONList& tuples ) = 0;
+            /** this means it's a final reduce, even if there is no finalizer */
+ virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer ) = 0;
+
+ long long numReduces;
+ };
+
+ // ------------ js function implementations -----------
+
+ /**
+         * used as a holder for Scope and ScriptingFunction
+         * visitor-like pattern, as the Scope is obtained on first access
+ */
+ class JSFunction : boost::noncopyable {
+ public:
+ /**
+ * @param type (map|reduce|finalize)
+ */
+ JSFunction( string type , const BSONElement& e );
+ virtual ~JSFunction() {}
+
+ virtual void init( State * state );
+
+ Scope * scope() const { return _scope; }
+ ScriptingFunction func() const { return _func; }
+
+ private:
+ string _type;
+ string _code; // actual javascript code
+ BSONObj _wantedScope; // this is for CodeWScope
+
+ Scope * _scope; // this is not owned by us, and might be shared
+ ScriptingFunction _func;
+ };
+
+ class JSMapper : public Mapper {
+ public:
+ JSMapper( const BSONElement & code ) : _func( "_map" , code ) {}
+ virtual void map( const BSONObj& o );
+ virtual void init( State * state );
+
+ private:
+ JSFunction _func;
+ BSONObj _params;
+ };
+
+ class JSReducer : public Reducer {
+ public:
+ JSReducer( const BSONElement& code ) : _func( "_reduce" , code ) {}
+ virtual void init( State * state );
+
+ virtual BSONObj reduce( const BSONList& tuples );
+ virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer );
+
+ private:
+
+ /**
+             * the result is left in the scope's "return" field
+ * @param key OUT
+ * @param endSizeEstimate OUT
+ */
+ void _reduce( const BSONList& values , BSONObj& key , int& endSizeEstimate );
+
+ JSFunction _func;
+ };
+
+ class JSFinalizer : public Finalizer {
+ public:
+ JSFinalizer( const BSONElement& code ) : _func( "_finalize" , code ) {}
+ virtual BSONObj finalize( const BSONObj& o );
+ virtual void init( State * state ) { _func.init( state ); }
+ private:
+ JSFunction _func;
+
+ };
+
+ // -----------------
+
+
+ class TupleKeyCmp {
+ public:
+ TupleKeyCmp() {}
+ bool operator()( const BSONObj &l, const BSONObj &r ) const {
+ return l.firstElement().woCompare( r.firstElement() ) < 0;
+ }
+ };
+
+ typedef map< BSONObj,BSONList,TupleKeyCmp > InMemory; // from key to list of tuples
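+
+        /* sketch of the shape of an InMemory map (hypothetical contents):
+             { "" : "a", "" : 1 }  ->  [ { "" : "a", "" : 1 }, { "" : "a", "" : 3 } ]
+           the map key is a representative tuple; TupleKeyCmp compares only the
+           first element, so every tuple sharing an emitted key lands in one list. */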
+
+ /**
+ * holds map/reduce config information
+ */
+ class Config {
+ public:
+ Config( const string& _dbname , const BSONObj& cmdObj );
+
+ string dbname;
+ string ns;
+
+ // options
+ bool verbose;
+ bool jsMode;
+ int splitInfo;
+
+ // query options
+
+ BSONObj filter;
+ BSONObj sort;
+ long long limit;
+
+ // functions
+
+ scoped_ptr<Mapper> mapper;
+ scoped_ptr<Reducer> reducer;
+ scoped_ptr<Finalizer> finalizer;
+
+ BSONObj mapParams;
+ BSONObj scopeSetup;
+
+ // output tables
+ string incLong;
+ string tempLong;
+
+ string finalShort;
+ string finalLong;
+
+ string outDB;
+
+ // max number of keys allowed in JS map before switching mode
+ long jsMaxKeys;
+ // ratio of duplicates vs unique keys before reduce is triggered in js mode
+ float reduceTriggerRatio;
+ // maximum size of map before it gets dumped to disk
+ long maxInMemSize;
+
+ enum { REPLACE , // atomically replace the collection
+ MERGE , // merge keys, override dups
+ REDUCE , // merge keys, reduce dups
+ INMEMORY // only store in memory, limited in size
+ } outType;
+
+ // if true, no lock during output operation
+ bool outNonAtomic;
+
+ static AtomicUInt JOB_NUMBER;
+        }; // end Config
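+
+        /* sketch of how the documented "out" forms presumably map onto outType
+           (an assumption based on the enum comments above, not verified here):
+             out: "coll"             -> REPLACE
+             out: { merge:  "coll" } -> MERGE
+             out: { reduce: "coll" } -> REDUCE
+             out: { inline: 1 }      -> INMEMORY
+           out: { ..., nonAtomic: true } would correspond to outNonAtomic. */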
+
+ /**
+ * stores information about intermediate map reduce state
+ * controls flow of data from map->reduce->finalize->output
+ */
+ class State {
+ public:
+ State( const Config& c );
+ ~State();
+
+ void init();
+
+ // ---- prep -----
+ bool sourceExists();
+
+ long long incomingDocuments();
+
+ // ---- map stage ----
+
+ /**
+             * stages an emitted tuple in in-memory storage
+ */
+ void emit( const BSONObj& a );
+
+ /**
+ * if size is big, run a reduce
+             * if it's still big, dump to temp collection
+ */
+ void checkSize();
+
+ /**
+ * run reduce on _temp
+ */
+ void reduceInMemory();
+
+ /**
+             * transfers in-memory storage to the temp collection
+ */
+ void dumpToInc();
+ void insertToInc( BSONObj& o );
+ void _insertToInc( BSONObj& o );
+
+ // ------ reduce stage -----------
+
+ void prepTempCollection();
+
+ void finalReduce( BSONList& values );
+
+ void finalReduce( CurOp * op , ProgressMeterHolder& pm );
+
+ // ------- cleanup/data positioning ----------
+
+ /**
+             @return number of objects in collection
+ */
+ long long postProcessCollection( CurOp* op , ProgressMeterHolder& pm );
+ long long postProcessCollectionNonAtomic( CurOp* op , ProgressMeterHolder& pm );
+
+ /**
+             * if INMEMORY, appends the results
+             * may also append stats or anything else it likes
+ */
+ void appendResults( BSONObjBuilder& b );
+
+ // -------- util ------------
+
+ /**
+ * inserts with correct replication semantics
+ */
+ void insert( const string& ns , const BSONObj& o );
+
+ // ------ simple accessors -----
+
+            /** State maintains ownership; do not use past State lifetime */
+ Scope* scope() { return _scope.get(); }
+
+ const Config& config() { return _config; }
+
+            bool isOnDisk() const { return _onDisk; }
+
+ long long numEmits() const { if (_jsMode) return _scope->getNumberLongLong("_emitCt"); return _numEmits; }
+ long long numReduces() const { if (_jsMode) return _scope->getNumberLongLong("_redCt"); return _config.reducer->numReduces; }
+
+ bool jsMode() {return _jsMode;}
+ void switchMode(bool jsMode);
+ void bailFromJS();
+
+ const Config& _config;
+ DBDirectClient _db;
+
+ protected:
+
+ void _add( InMemory* im , const BSONObj& a , long& size );
+
+ scoped_ptr<Scope> _scope;
+            bool _onDisk; // whether the end result of this map/reduce is written to disk
+
+ scoped_ptr<InMemory> _temp;
+ long _size; // bytes in _temp
+ long _dupCount; // number of duplicate key entries
+
+ long long _numEmits;
+
+ bool _jsMode;
+ ScriptingFunction _reduceAll;
+ ScriptingFunction _reduceAndEmit;
+ ScriptingFunction _reduceAndFinalize;
+ ScriptingFunction _reduceAndFinalizeAndInsert;
+ };
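+
+        /* rough data flow through State, summarizing the stages above:
+             emit() -> _temp (InMemory) -> checkSize() -> reduceInMemory()
+             -> dumpToInc() (when on disk) -> finalReduce() -> postProcessCollection() */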
+
+ BSONObj fast_emit( const BSONObj& args, void* data );
+ BSONObj _bailFromJS( const BSONObj& args, void* data );
+
+ } // end mr namespace
+}
+
+
diff --git a/src/mongo/db/commands/pipeline.cpp b/src/mongo/db/commands/pipeline.cpp
new file mode 100755
index 00000000000..4ad5e342aed
--- /dev/null
+++ b/src/mongo/db/commands/pipeline.cpp
@@ -0,0 +1,405 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/commands/pipeline.h"
+
+#include "db/cursor.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/document_source.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pdfile.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+
+ const char Pipeline::commandName[] = "aggregate";
+ const char Pipeline::pipelineName[] = "pipeline";
+ const char Pipeline::fromRouterName[] = "fromRouter";
+ const char Pipeline::splitMongodPipelineName[] = "splitMongodPipeline";
+
+ Pipeline::~Pipeline() {
+ }
+
+ Pipeline::Pipeline(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ collectionName(),
+ sourceVector(),
+ splitMongodPipeline(DEBUG_BUILD == 1), /* test: always split for DEV */
+ pCtx(pTheCtx) {
+ }
+
+
+
+ /* this structure is used to make a lookup table of operators */
+ struct StageDesc {
+ const char *pName;
+ intrusive_ptr<DocumentSource> (*pFactory)(
+ BSONElement *, const intrusive_ptr<ExpressionContext> &);
+ };
+
+ /* this table must be in alphabetical order by name for bsearch() */
+ static const StageDesc stageDesc[] = {
+#ifdef NEVER /* disabled for now in favor of $match */
+ {DocumentSourceFilter::filterName,
+ DocumentSourceFilter::createFromBson},
+#endif
+ {DocumentSourceGroup::groupName,
+ DocumentSourceGroup::createFromBson},
+ {DocumentSourceLimit::limitName,
+ DocumentSourceLimit::createFromBson},
+ {DocumentSourceMatch::matchName,
+ DocumentSourceMatch::createFromBson},
+#ifdef LATER /* https://jira.mongodb.org/browse/SERVER-3253 */
+ {DocumentSourceOut::outName,
+ DocumentSourceOut::createFromBson},
+#endif
+ {DocumentSourceProject::projectName,
+ DocumentSourceProject::createFromBson},
+ {DocumentSourceSkip::skipName,
+ DocumentSourceSkip::createFromBson},
+ {DocumentSourceSort::sortName,
+ DocumentSourceSort::createFromBson},
+ {DocumentSourceUnwind::unwindName,
+ DocumentSourceUnwind::createFromBson},
+ };
+ static const size_t nStageDesc = sizeof(stageDesc) / sizeof(StageDesc);
+
+ static int stageDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const StageDesc *)pL)->pName,
+ ((const StageDesc *)pR)->pName);
+ }
+
+ boost::shared_ptr<Pipeline> Pipeline::parseCommand(
+ string &errmsg, BSONObj &cmdObj,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ boost::shared_ptr<Pipeline> pPipeline(new Pipeline(pCtx));
+ vector<BSONElement> pipeline;
+
+ /* gather the specification for the aggregation */
+ for(BSONObj::iterator cmdIterator = cmdObj.begin();
+ cmdIterator.more(); ) {
+ BSONElement cmdElement(cmdIterator.next());
+ const char *pFieldName = cmdElement.fieldName();
+
+ /* look for the aggregation command */
+ if (!strcmp(pFieldName, commandName)) {
+ pPipeline->collectionName = cmdElement.String();
+ continue;
+ }
+
+            /* check for the pipeline specification */
+ if (!strcmp(pFieldName, pipelineName)) {
+ pipeline = cmdElement.Array();
+ continue;
+ }
+
+ /* if the request came from the router, we're in a shard */
+ if (!strcmp(pFieldName, fromRouterName)) {
+ pCtx->setInShard(cmdElement.Bool());
+ continue;
+ }
+
+ /* check for debug options */
+ if (!strcmp(pFieldName, splitMongodPipelineName)) {
+ pPipeline->splitMongodPipeline = true;
+ continue;
+ }
+
+ /* we didn't recognize a field in the command */
+            ostringstream sb;
+            sb <<
+                "Pipeline::parseCommand(): unrecognized field \"" <<
+                cmdElement.fieldName() << "\"";
+            errmsg = sb.str();
+ return boost::shared_ptr<Pipeline>();
+ }
+
+ /*
+ If we get here, we've harvested the fields we expect for a pipeline.
+
+ Set up the specified document source pipeline.
+ */
+ SourceVector *pSourceVector = &pPipeline->sourceVector; // shorthand
+
+ /* iterate over the steps in the pipeline */
+ const size_t nSteps = pipeline.size();
+ for(size_t iStep = 0; iStep < nSteps; ++iStep) {
+ /* pull out the pipeline element as an object */
+ BSONElement pipeElement(pipeline[iStep]);
+ uassert(15942, str::stream() << "pipeline element " <<
+ iStep << " is not an object",
+ pipeElement.type() == Object);
+ BSONObj bsonObj(pipeElement.Obj());
+
+ intrusive_ptr<DocumentSource> pSource;
+
+ /* use the object to add a DocumentSource to the processing chain */
+ BSONObjIterator bsonIterator(bsonObj);
+ while(bsonIterator.more()) {
+ BSONElement bsonElement(bsonIterator.next());
+ const char *pFieldName = bsonElement.fieldName();
+
+ /* select the appropriate operation and instantiate */
+ StageDesc key;
+ key.pName = pFieldName;
+ const StageDesc *pDesc = (const StageDesc *)
+ bsearch(&key, stageDesc, nStageDesc, sizeof(StageDesc),
+ stageDescCmp);
+ if (pDesc)
+ pSource = (*pDesc->pFactory)(&bsonElement, pCtx);
+ else {
+ ostringstream sb;
+ sb <<
+                        "Pipeline::parseCommand(): unrecognized pipeline op \"" <<
+                        pFieldName << "\"";
+ errmsg = sb.str();
+ return shared_ptr<Pipeline>();
+ }
+ }
+
+ pSourceVector->push_back(pSource);
+ }
+
+ /* if there aren't any pipeline stages, there's nothing more to do */
+ if (!pSourceVector->size())
+ return pPipeline;
+
+ /*
+ Move filters up where possible.
+
+          CW TODO -- move filters past projections where possible, noting
+          corresponding field renamings.
+ */
+
+ /*
+ Wherever there is a match immediately following a sort, swap them.
+ This means we sort fewer items. Neither changes the documents in
+ the stream, so this transformation shouldn't affect the result.
+
+ We do this first, because then when we coalesce operators below,
+ any adjacent matches will be combined.
+ */
+ for(size_t srcn = pSourceVector->size(), srci = 1;
+ srci < srcn; ++srci) {
+ intrusive_ptr<DocumentSource> &pSource = pSourceVector->at(srci);
+ if (dynamic_cast<DocumentSourceMatch *>(pSource.get())) {
+ intrusive_ptr<DocumentSource> &pPrevious =
+ pSourceVector->at(srci - 1);
+ if (dynamic_cast<DocumentSourceSort *>(pPrevious.get())) {
+ /* swap this item with the previous */
+ intrusive_ptr<DocumentSource> pTemp(pPrevious);
+ pPrevious = pSource;
+ pSource = pTemp;
+ }
+ }
+ }
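+
+        /* illustrative effect of the swap above (hypothetical stages):
+             [ { $sort: { a: 1 } }, { $match: { a: { $gt: 5 } } } ]
+           becomes
+             [ { $match: { a: { $gt: 5 } } }, { $sort: { a: 1 } } ]
+           so fewer documents reach the sort, with identical output. */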
+
+ /*
+ Coalesce adjacent filters where possible. Two adjacent filters
+ are equivalent to one filter whose predicate is the conjunction of
+ the two original filters' predicates. For now, capture this by
+          giving any DocumentSource the option to absorb its successor; this
+ will also allow adjacent projections to coalesce when possible.
+
+ Run through the DocumentSources, and give each one the opportunity
+ to coalesce with its successor. If successful, remove the
+ successor.
+
+ Move all document sources to a temporary list.
+ */
+ SourceVector tempVector(*pSourceVector);
+ pSourceVector->clear();
+
+ /* move the first one to the final list */
+ pSourceVector->push_back(tempVector[0]);
+
+ /* run through the sources, coalescing them or keeping them */
+ for(size_t tempn = tempVector.size(), tempi = 1;
+ tempi < tempn; ++tempi) {
+ /*
+ If we can't coalesce the source with the last, then move it
+ to the final list, and make it the new last. (If we succeeded,
+ then we're still on the same last, and there's no need to move
+ or do anything with the source -- the destruction of tempVector
+ will take care of the rest.)
+ */
+ intrusive_ptr<DocumentSource> &pLastSource = pSourceVector->back();
+ intrusive_ptr<DocumentSource> &pTemp = tempVector.at(tempi);
+ if (!pLastSource->coalesce(pTemp))
+ pSourceVector->push_back(pTemp);
+ }
+
+ /* optimize the elements in the pipeline */
+ for(SourceVector::iterator iter(pSourceVector->begin()),
+ listEnd(pSourceVector->end()); iter != listEnd; ++iter)
+ (*iter)->optimize();
+
+ return pPipeline;
+ }
+
+ shared_ptr<Pipeline> Pipeline::splitForSharded() {
+        /* create and initialize the shard spec we'll return */
+ shared_ptr<Pipeline> pShardPipeline(new Pipeline(pCtx));
+ pShardPipeline->collectionName = collectionName;
+
+ /* put the source list aside */
+ SourceVector tempVector(sourceVector);
+ sourceVector.clear();
+
+ /*
+ Run through the pipeline, looking for points to split it into
+ shard pipelines, and the rest.
+ */
+ while(!tempVector.empty()) {
+ intrusive_ptr<DocumentSource> &pSource = tempVector.front();
+
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ DocumentSourceSort *pSort =
+ dynamic_cast<DocumentSourceSort *>(pSource.get());
+ if (pSort) {
+ /*
+ There's no point in sorting until the result is combined.
+ Therefore, sorts should be done in mongos, and not in
+ the shard at all. Add all the remaining operators to
+ the mongos list and quit.
+
+ TODO: unless the sort key is the shard key.
+ TODO: we could also do a merge sort in mongos in the
+ future, and split here.
+ */
+ for(size_t tempn = tempVector.size(), tempi = 0;
+ tempi < tempn; ++tempi)
+ sourceVector.push_back(tempVector[tempi]);
+ break;
+ }
+#endif
+
+ /* hang on to this in advance, in case it is a group */
+ DocumentSourceGroup *pGroup =
+ dynamic_cast<DocumentSourceGroup *>(pSource.get());
+
+ /* move the source from the tempVector to the shard sourceVector */
+ pShardPipeline->sourceVector.push_back(pSource);
+ tempVector.erase(tempVector.begin());
+
+ /*
+ If we found a group, that's a split point.
+ */
+ if (pGroup) {
+ /* start this pipeline with the group merger */
+ sourceVector.push_back(pGroup->createMerger());
+
+ /* and then add everything that remains and quit */
+ for(size_t tempn = tempVector.size(), tempi = 0;
+ tempi < tempn; ++tempi)
+ sourceVector.push_back(tempVector[tempi]);
+ break;
+ }
+ }
+
+ return pShardPipeline;
+ }
+
+ void Pipeline::getCursorMods(BSONObjBuilder *pQueryBuilder,
+ BSONObjBuilder *pSortBuilder) {
+ /* look for an initial $match */
+ if (!sourceVector.size())
+ return;
+ const intrusive_ptr<DocumentSource> &pMC = sourceVector.front();
+ const DocumentSourceMatch *pMatch =
+ dynamic_cast<DocumentSourceMatch *>(pMC.get());
+
+ if (pMatch) {
+ /* build the query */
+ pMatch->toMatcherBson(pQueryBuilder);
+
+ /* remove the match from the pipeline */
+ sourceVector.erase(sourceVector.begin());
+ }
+
+ /* look for an initial $sort */
+ if (!sourceVector.size())
+ return;
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ const intrusive_ptr<DocumentSource> &pSC = sourceVector.front();
+ const DocumentSourceSort *pSort =
+ dynamic_cast<DocumentSourceSort *>(pSC.get());
+
+ if (pSort) {
+ /* build the sort key */
+ pSort->sortKeyToBson(pSortBuilder, false);
+
+ /* remove the sort from the pipeline */
+ sourceVector.erase(sourceVector.begin());
+ }
+#endif
+ }
+
+ void Pipeline::toBson(BSONObjBuilder *pBuilder) const {
+ /* create an array out of the pipeline operations */
+ BSONArrayBuilder arrayBuilder;
+ for(SourceVector::const_iterator iter(sourceVector.begin()),
+ listEnd(sourceVector.end()); iter != listEnd; ++iter) {
+ intrusive_ptr<DocumentSource> pSource(*iter);
+ pSource->addToBsonArray(&arrayBuilder);
+ }
+
+ /* add the top-level items to the command */
+ pBuilder->append(commandName, getCollectionName());
+ pBuilder->append(pipelineName, arrayBuilder.arr());
+
+ bool btemp;
+ if ((btemp = getSplitMongodPipeline())) {
+ pBuilder->append(splitMongodPipelineName, btemp);
+ }
+ if ((btemp = pCtx->getInRouter())) {
+ pBuilder->append(fromRouterName, btemp);
+ }
+ }
+
+ bool Pipeline::run(BSONObjBuilder &result, string &errmsg,
+ intrusive_ptr<DocumentSource> pSource) {
+ /* chain together the sources we found */
+ for(SourceVector::iterator iter(sourceVector.begin()),
+ listEnd(sourceVector.end()); iter != listEnd; ++iter) {
+ intrusive_ptr<DocumentSource> pTemp(*iter);
+ pTemp->setSource(pSource);
+ pSource = pTemp;
+ }
+ /* pSource is left pointing at the last source in the chain */
+
+ /*
+ Iterate through the resulting documents, and add them to the result.
+ */
+ BSONArrayBuilder resultArray; // where we'll stash the results
+ for(bool hasDocument = !pSource->eof(); hasDocument;
+ hasDocument = pSource->advance()) {
+ boost::intrusive_ptr<Document> pDocument(pSource->getCurrent());
+
+ /* add the document to the result set */
+ BSONObjBuilder documentBuilder;
+ pDocument->toBson(&documentBuilder);
+ resultArray.append(documentBuilder.done());
+ }
+
+ result.appendArray("result", resultArray.arr());
+
+ return true;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/commands/pipeline.h b/src/mongo/db/commands/pipeline.h
new file mode 100755
index 00000000000..ef9cc6afe51
--- /dev/null
+++ b/src/mongo/db/commands/pipeline.h
@@ -0,0 +1,183 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "db/jsobj.h"
+#include "util/timer.h"
+#include "db/commands.h"
+
+namespace mongo {
+ class BSONObj;
+ class BSONObjBuilder;
+ class DocumentSource;
+ class DocumentSourceProject;
+ class Expression;
+ class ExpressionContext;
+ class ExpressionNary;
+ struct OpDesc; // local private struct
+
+    /** an aggregation pipeline: an ordered chain of DocumentSources that
+        documents flow through; built from the "aggregate" command.
+    */
+ class Pipeline :
+ boost::noncopyable {
+ public:
+ virtual ~Pipeline();
+
+ /*
+ Create a pipeline from the command.
+
+ @param errmsg where to write errors, if there are any
+ @param cmdObj the command object sent from the client
+ @returns the pipeline, if created, otherwise a NULL reference
+ */
+ static boost::shared_ptr<Pipeline> parseCommand(
+ string &errmsg, BSONObj &cmdObj,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Get the collection name from the command.
+
+ @returns the collection name
+ */
+ string getCollectionName() const;
+
+ /*
+ Split the current Pipeline into a Pipeline for each shard, and
+ a Pipeline that combines the results within mongos.
+
+ This permanently alters this pipeline for the merging operation.
+
+ @returns the Spec for the pipeline command that should be sent
+ to the shards
+ */
+ boost::shared_ptr<Pipeline> splitForSharded();
+
+ /*
+ Get Cursor creation modifiers.
+
+ If we have a $match or a $sort at the beginning of the pipeline,
+ these can be extracted and used to modify the cursor we'll use for
+ the initial collection scan.
+
+ If there is a Matcher query at the beginning of the pipeline,
+ get it, by adding its terms to the object under construction. If
+ not, this adds nothing to the object under construction.
+
+ If there is a sort at the beginning of the pipeline, get it, by
+ adding its terms to the object under construction. If not, this adds
+ nothing.
+
+ Optimization steps in parseCommand make sure that for any pairs
+ of adjacent matches and sorts, the match comes first. This ensures
+ that we sort a minimum of items, and doesn't change the result.
+ When getCursorMods() examines the pipeline, it looks for an initial
+ $match. If present, that is put into pQueryBuilder. If there is
+ a query, then the next stage is checked for a $sort, which will go
+ into pSortBuilder. If there is no initial $match, then a check is
+ made for an initial $sort, which will then still be put into
+ pSortBuilder.
+
+ As a side-effect, retrieving the Cursor modifications removes them
+ from the pipeline.
+
+ @param pQueryBuilder an initialized object builder
+ @param pSortBuilder an initialized object builder
+ */
+ void getCursorMods(BSONObjBuilder *pQueryBuilder,
+ BSONObjBuilder *pSortBuilder);
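+
+        /* illustrative example (hypothetical pipeline): given
+             [ { $match: { a: 1 } }, { $sort: { b: 1 } }, { $group: ... } ]
+           pQueryBuilder receives { a: 1 } and pSortBuilder receives { b: 1 }
+           (the latter once sort extraction is enabled; see SERVER-3832),
+           leaving the pipeline holding only [ { $group: ... } ]. */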
+
+ /*
+ Write the Pipeline as a BSONObj command. This should be the
+ inverse of parseCommand().
+
+ This is only intended to be used by the shard command obtained
+ from splitForSharded(). Some pipeline operations in the merge
+ process do not have equivalent command forms, and using this on
+ the mongos Pipeline will cause assertions.
+
+          @param pBuilder the builder to write the command to
+ */
+ void toBson(BSONObjBuilder *pBuilder) const;
+
+ /*
+ Run the Pipeline on the given source.
+
+ @param result builder to write the result to
+ @param errmsg place to put error messages, if any
+ @param pSource the document source to use at the head of the chain
+ @returns true on success, false if an error occurs
+ */
+ bool run(BSONObjBuilder &result, string &errmsg,
+ intrusive_ptr<DocumentSource> pSource);
+
+ /*
+ Debugging: should the processing pipeline be split within
+ mongod, simulating the real mongos/mongod split? This is determined
+ by setting the splitMongodPipeline field in an "aggregate"
+ command.
+
+ The split itself is handled by the caller, which is currently
+ pipeline_command.cpp.
+
+ @returns true if the pipeline is to be split
+ */
+ bool getSplitMongodPipeline() const;
+
+ /*
+ The aggregation command name.
+ */
+ static const char commandName[];
+
+ private:
+ static const char pipelineName[];
+ static const char fromRouterName[];
+ static const char splitMongodPipelineName[];
+
+ Pipeline(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ string collectionName;
+ typedef vector<intrusive_ptr<DocumentSource> > SourceVector;
+ SourceVector sourceVector;
+
+ bool splitMongodPipeline;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+} // namespace mongo
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline string Pipeline::getCollectionName() const {
+ return collectionName;
+ }
+
+ inline bool Pipeline::getSplitMongodPipeline() const {
+ if (!DEBUG_BUILD)
+ return false;
+
+ return splitMongodPipeline;
+ }
+
+} // namespace mongo
+
+
diff --git a/src/mongo/db/commands/pipeline_command.cpp b/src/mongo/db/commands/pipeline_command.cpp
new file mode 100755
index 00000000000..9863e14556c
--- /dev/null
+++ b/src/mongo/db/commands/pipeline_command.cpp
@@ -0,0 +1,187 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/commands/pipeline.h"
+#include "db/cursor.h"
+#include "db/pdfile.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/document_source.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/queryoptimizer.h"
+
+namespace mongo {
+
+ /** mongodb "commands" (sent via db.$cmd.findOne(...))
+ subclass to make a command. define a singleton object for it.
+ */
+ class PipelineCommand :
+ public Command {
+ public:
+ // virtuals from Command
+ virtual ~PipelineCommand();
+ virtual bool run(const string &db, BSONObj &cmdObj, int options,
+ string &errmsg, BSONObjBuilder &result, bool fromRepl);
+ virtual LockType locktype() const;
+ virtual bool slaveOk() const;
+ virtual void help(stringstream &help) const;
+
+ PipelineCommand();
+ };
+
+ // self-registering singleton static instance
+ static PipelineCommand pipelineCommand;
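+
+    /* illustrative client invocation (hypothetical collection and stages):
+         db.runCommand({ aggregate: "events",
+                         pipeline: [ { $match: { a: 1 } },
+                                     { $group: { _id: "$b" } } ] })
+       adding splitMongodPipeline: true exercises the debug split path in
+       run() below. */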
+
+ PipelineCommand::PipelineCommand():
+ Command(Pipeline::commandName) {
+ }
+
+ Command::LockType PipelineCommand::locktype() const {
+ return READ;
+ }
+
+ bool PipelineCommand::slaveOk() const {
+ return true;
+ }
+
+ void PipelineCommand::help(stringstream &help) const {
+ help << "{ pipeline : [ { <data-pipe-op>: {...}}, ... ] }";
+ }
+
+ PipelineCommand::~PipelineCommand() {
+ }
+
+ bool PipelineCommand::run(const string &db, BSONObj &cmdObj,
+ int options, string &errmsg,
+ BSONObjBuilder &result, bool fromRepl) {
+
+ intrusive_ptr<ExpressionContext> pCtx(ExpressionContext::create());
+
+ /* try to parse the command; if this fails, then we didn't run */
+ boost::shared_ptr<Pipeline> pPipeline(
+ Pipeline::parseCommand(errmsg, cmdObj, pCtx));
+ if (!pPipeline.get())
+ return false;
+
+ /* get a query to use, if any */
+ BSONObjBuilder queryBuilder;
+ BSONObjBuilder sortBuilder;
+ pPipeline->getCursorMods(&queryBuilder, &sortBuilder);
+ BSONObj query(queryBuilder.done());
+ BSONObj sort(sortBuilder.done());
+
+ /* for debugging purposes, show what the query and sort are */
+ DEV {
+ (log() << "\n---- query BSON\n" <<
+ query.jsonString(Strict, 1) << "\n----\n").flush();
+ (log() << "\n---- sort BSON\n" <<
+ sort.jsonString(Strict, 1) << "\n----\n").flush();
+ }
+
+ /* create a cursor for that query */
+ string fullName(db + "." + pPipeline->getCollectionName());
+ shared_ptr<Cursor> pCursor(
+ NamespaceDetailsTransient::getCursor(
+ fullName.c_str(), query
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ , sort
+#endif
+ ));
+
+ /* wrap the cursor with a DocumentSource */
+ intrusive_ptr<DocumentSource> pSource(
+ DocumentSourceCursor::create(pCursor));
+
+ /* this is the normal non-debug path */
+ if (!pPipeline->getSplitMongodPipeline())
+ return pPipeline->run(result, errmsg, pSource);
+
+        /* set up as if we're in the router */
+ pCtx->setInRouter(true);
+
+ /*
+ Here, we'll split the pipeline in the same way we would for sharding,
+ for testing purposes.
+
+ Run the shard pipeline first, then feed the results into the remains
+ of the existing pipeline.
+
+ Start by splitting the pipeline.
+ */
+ shared_ptr<Pipeline> pShardSplit(
+ pPipeline->splitForSharded());
+
+ /*
+ Write the split pipeline as we would in order to transmit it to
+ the shard servers.
+ */
+ BSONObjBuilder shardBuilder;
+ pShardSplit->toBson(&shardBuilder);
+ BSONObj shardBson(shardBuilder.done());
+
+ DEV (log() << "\n---- shardBson\n" <<
+ shardBson.jsonString(Strict, 1) << "\n----\n").flush();
+
+ /* for debugging purposes, show what the pipeline now looks like */
+ DEV {
+ BSONObjBuilder pipelineBuilder;
+ pPipeline->toBson(&pipelineBuilder);
+ BSONObj pipelineBson(pipelineBuilder.done());
+ (log() << "\n---- pipelineBson\n" <<
+ pipelineBson.jsonString(Strict, 1) << "\n----\n").flush();
+ }
+
+ /* on the shard servers, create the local pipeline */
+ intrusive_ptr<ExpressionContext> pShardCtx(ExpressionContext::create());
+ shared_ptr<Pipeline> pShardPipeline(
+ Pipeline::parseCommand(errmsg, shardBson, pShardCtx));
+ if (!pShardPipeline.get()) {
+ return false;
+ }
+
+ /* run the shard pipeline */
+ BSONObjBuilder shardResultBuilder;
+ string shardErrmsg;
+ pShardPipeline->run(shardResultBuilder, shardErrmsg, pSource);
+ BSONObj shardResult(shardResultBuilder.done());
+
+ /* pick out the shard result, and prepare to read it */
+ intrusive_ptr<DocumentSourceBsonArray> pShardSource;
+ BSONObjIterator shardIter(shardResult);
+ while(shardIter.more()) {
+ BSONElement shardElement(shardIter.next());
+ const char *pFieldName = shardElement.fieldName();
+
+ if (strcmp(pFieldName, "result") == 0) {
+ pShardSource = DocumentSourceBsonArray::create(&shardElement);
+
+ /*
+ Connect the output of the shard pipeline with the mongos
+ pipeline that will merge the results.
+ */
+ return pPipeline->run(result, errmsg, pShardSource);
+ }
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/common.cpp b/src/mongo/db/common.cpp
new file mode 100644
index 00000000000..cd073f8b059
--- /dev/null
+++ b/src/mongo/db/common.cpp
@@ -0,0 +1,73 @@
+/** @file common.cpp
+ Common code for server binaries (mongos, mongod, test).
+   Nothing used by the driver should be here.
+ */
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+//#include "pch.h"
+//#include "concurrency.h"
+#include "jsobjmanipulator.h"
+
+/**
+ * this just has globals
+ */
+namespace mongo {
+
+ /** called by mongos, mongod, test. do not call from clients and such.
+        invoked before just about everything except global var construction.
+ */
+ void doPreServerStartupInits() {
+#if defined(RLIMIT_NPROC) && defined(RLIMIT_NOFILE)
+        // Check that the open-files rlimit is > 1000, and the processes rlimit is > # of files / 2
+ const unsigned int minNumFiles = 1000;
+ const double filesToProcsRatio = 2.0;
+ struct rlimit rlnproc;
+ struct rlimit rlnofile;
+
+ if(!getrlimit(RLIMIT_NPROC,&rlnproc) && !getrlimit(RLIMIT_NOFILE,&rlnofile)){
+ if(rlnofile.rlim_cur < minNumFiles){
+ log() << "Warning: soft rlimits too low. Number of files is " << rlnofile.rlim_cur << ", should be at least " << minNumFiles << endl;
+ }
+ if(rlnproc.rlim_cur < rlnofile.rlim_cur/filesToProcsRatio){
+ log() << "Warning: soft rlimits too low. " << rlnproc.rlim_cur << " processes, " << rlnofile.rlim_cur << " files. Number of processes should be at least "<< 1/filesToProcsRatio << " times number of files." << endl;
+ }
+ }
+ else{
+ log() << "Warning: getrlimit failed" << endl;
+ }
+#endif
+ }
+
+ NOINLINE_DECL OpTime OpTime::skewed() {
+ bool toLog = false;
+ ONCE toLog = true;
+ RARELY toLog = true;
+ last.i++;
+ if ( last.i & 0x80000000 )
+ toLog = true;
+ if ( toLog ) {
+ log() << "clock skew detected prev: " << last.secs << " now: " << (unsigned) time(0) << endl;
+ }
+ if ( last.i & 0x80000000 ) {
+ log() << "error large clock skew detected, shutting down" << endl;
+ throw ClockSkewException();
+ }
+ return last;
+ }
+
+}
diff --git a/src/mongo/db/compact.cpp b/src/mongo/db/compact.cpp
new file mode 100644
index 00000000000..32931b6c5fd
--- /dev/null
+++ b/src/mongo/db/compact.cpp
@@ -0,0 +1,376 @@
+/** @file compact.cpp
+ compaction of deleted space in pdfiles (datafiles)
+*/
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "concurrency.h"
+#include "commands.h"
+#include "curop-inl.h"
+#include "background.h"
+#include "extsort.h"
+#include "compact.h"
+#include "../util/concurrency/task.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ char faux;
+
+ void addRecordToRecListInExtent(Record *r, DiskLoc loc);
+ DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god);
+ void freeExtents(DiskLoc firstExt, DiskLoc lastExt);
+
+    /* this should be done in alloc record, not here; doing it here for now.
+       really dumb; it's a start.
+ */
+ unsigned quantizeMask(unsigned x) {
+ if( x > 4096 * 20 )
+ return ~4095;
+ if( x >= 512 )
+ return ~63;
+ return ~0;
+ }
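+
+    /* worked examples of the quantization above (illustrative values):
+         x = 300                 -> mask ~0    -> 300    (unchanged)
+         x = 1100   (>= 512)     -> mask ~63   -> 1088   (64-byte granule)
+         x = 100000 (> 4096*20)  -> mask ~4095 -> 98304  (4KB granule)
+       compactExtent() below applies this as
+         lenWPadding = (unsigned)(pf * lenWHdr) + pb, then & quantizeMask(lenWPadding). */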
+
+ /** @return number of skipped (invalid) documents */
+ unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc ext, int n,
+ const scoped_array<IndexSpec> &indexSpecs,
+ scoped_array<SortPhaseOne>& phase1, int nidx, bool validate,
+ double pf, int pb)
+ {
+ log() << "compact extent #" << n << endl;
+ unsigned oldObjSize = 0; // we'll report what the old padding was
+ unsigned oldObjSizeWithPadding = 0;
+
+ Extent *e = ext.ext();
+ e->assertOk();
+ assert( e->validates() );
+ unsigned skipped = 0;
+
+ {
+ // the next/prev pointers within the extent might not be in order so we first page the whole thing in
+ // sequentially
+ log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
+ Timer t;
+ MAdvise adv(e, e->length, MAdvise::Sequential);
+ const char *p = (const char *) e;
+ for( int i = 0; i < e->length; i += 4096 ) {
+ faux += p[i];
+ }
+ int ms = t.millis();
+ if( ms > 1000 )
+ log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl;
+ }
+
+ {
+ log() << "compact copying records" << endl;
+ unsigned totalSize = 0;
+ int nrecs = 0;
+ DiskLoc L = e->firstRecord;
+ if( !L.isNull() ) {
+ while( 1 ) {
+ Record *recOld = L.rec();
+ L = recOld->nextInExtent(L);
+ nrecs++;
+ BSONObj objOld(recOld);
+
+ if( !validate || objOld.valid() ) {
+ unsigned sz = objOld.objsize();
+
+ oldObjSize += sz;
+ oldObjSizeWithPadding += recOld->netLength();
+
+ unsigned lenWHdr = sz + Record::HeaderSize;
+ unsigned lenWPadding = lenWHdr;
+ {
+ lenWPadding = static_cast<unsigned>(pf*lenWPadding);
+ lenWPadding += pb;
+ lenWPadding = lenWPadding & quantizeMask(lenWPadding);
+ if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
+ lenWPadding = lenWHdr;
+ }
+ }
+ totalSize += lenWPadding;
+ DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
+ uassert(14024, "compact error out of space during compaction", !loc.isNull());
+ Record *recNew = loc.rec();
+ recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
+ addRecordToRecListInExtent(recNew, loc);
+ memcpy(recNew->data, objOld.objdata(), sz);
+
+ {
+ // extract keys for all indexes we will be rebuilding
+ for( int x = 0; x < nidx; x++ ) {
+ phase1[x].addKeys(indexSpecs[x], objOld, loc);
+ }
+ }
+ }
+ else {
+ if( ++skipped <= 10 )
+ log() << "compact skipping invalid object" << endl;
+ }
+
+ if( L.isNull() ) {
+ // we just did the very last record from the old extent. it's still pointed to
+ // by the old extent ext, but that will be fixed below after this loop
+ break;
+ }
+
+ // remove the old records (orphan them) periodically so our commit block doesn't get too large
+ bool stopping = false;
+ RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
+ if( stopping || getDur().aCommitIsNeeded() ) {
+ e->firstRecord.writing() = L;
+ Record *r = L.rec();
+ getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs;
+ getDur().commitIfNeeded();
+ killCurrentOp.checkForInterrupt(false);
+ }
+ }
+ } // if !L.isNull()
+
+ assert( d->firstExtent == ext );
+ assert( d->lastExtent != ext );
+ DiskLoc newFirst = e->xnext;
+ d->firstExtent.writing() = newFirst;
+ newFirst.ext()->xprev.writing().Null();
+ getDur().writing(e)->markEmpty();
+ freeExtents(ext,ext);
+ getDur().commitIfNeeded();
+
+ {
+ double op = 1.0;
+ if( oldObjSize )
+ op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
+ log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB"
+ << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100
+ << endl;
+ }
+ }
+
+ return skipped;
+ }
+
+ extern SortPhaseOne *precalced;
+
+ bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) {
+ //int les = d->lastExtentSize;
+
+ // this is a big job, so might as well make things tidy before we start just to be nice.
+ getDur().commitNow();
+
+ list<DiskLoc> extents;
+ for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext )
+ extents.push_back(L);
+ log() << "compact " << extents.size() << " extents" << endl;
+
+ ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) );
+
+        // same data, but might perform a little differently after compact?
+ NamespaceDetailsTransient::get(ns).clearQueryCache();
+
+ int nidx = d->nIndexes;
+ scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] );
+ scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] );
+ {
+ NamespaceDetails::IndexIterator ii = d->ii();
+ int x = 0;
+ while( ii.more() ) {
+ BSONObjBuilder b;
+ IndexDetails& idx = ii.next();
+ BSONObj::iterator i(idx.info.obj());
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if( !str::equals(e.fieldName(), "v") && !str::equals(e.fieldName(), "background") ) {
+ b.append(e);
+ }
+ }
+ BSONObj o = b.obj().getOwned();
+ phase1[x].sorter.reset( new BSONObjExternalSorter( idx.idxInterface(), o.getObjectField("key") ) );
+ phase1[x].sorter->hintNumObjects( d->stats.nrecords );
+ indexSpecs[x++].reset(o);
+ }
+ }
+
+ log() << "compact orphan deleted lists" << endl;
+ for( int i = 0; i < Buckets; i++ ) {
+ d->deletedList[i].writing().Null();
+ }
+
+
+
+ // Start over from scratch with our extent sizing and growth
+ d->lastExtentSize=0;
+
+ // before dropping indexes, at least make sure we can allocate one extent!
+ uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull());
+
+ // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here
+ log() << "compact dropping indexes" << endl;
+ BSONObjBuilder b;
+ if( !dropIndexes(d, ns, "*", errmsg, b, true) ) {
+ errmsg = "compact drop indexes failed";
+ log() << errmsg << endl;
+ return false;
+ }
+
+ getDur().commitNow();
+
+ long long skipped = 0;
+ int n = 0;
+ for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
+ skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate, pf, pb);
+ pm.hit();
+ }
+
+ if( skipped ) {
+ result.append("invalidObjects", skipped);
+ }
+
+ assert( d->firstExtent.ext()->xprev.isNull() );
+
+ // indexes will do their own progress meter?
+ pm.finished();
+
+ // build indexes
+ NamespaceString s(ns);
+ string si = s.db + ".system.indexes";
+ for( int i = 0; i < nidx; i++ ) {
+ killCurrentOp.checkForInterrupt(false);
+ BSONObj info = indexSpecs[i].info;
+ log() << "compact create index " << info["key"].Obj().toString() << endl;
+ try {
+ precalced = &phase1[i];
+ theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize());
+ }
+ catch(...) {
+ precalced = 0;
+ throw;
+ }
+ precalced = 0;
+ }
+
+ return true;
+ }
+
+ bool compact(const string& ns, string &errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) {
+ massert( 14028, "bad ns", NamespaceString::normal(ns.c_str()) );
+        massert( 14027, "can't compact a system namespace", !str::contains(ns, ".system.") ); // items in system.indexes cannot be moved; there are pointers to those disklocs in NamespaceDetails
+
+ bool ok;
+ {
+ writelock lk;
+ BackgroundOperation::assertNoBgOpInProgForNs(ns.c_str());
+ Client::Context ctx(ns);
+ NamespaceDetails *d = nsdetails(ns.c_str());
+ massert( 13660, str::stream() << "namespace " << ns << " does not exist", d );
+ massert( 13661, "cannot compact capped collection", !d->capped );
+ log() << "compact " << ns << " begin" << endl;
+ if( pf != 0 || pb != 0 ) {
+ log() << "paddingFactor:" << pf << " paddingBytes:" << pb << endl;
+ }
+ try {
+ ok = _compact(ns.c_str(), d, errmsg, validate, result, pf, pb);
+ }
+ catch(...) {
+ log() << "compact " << ns << " end (with error)" << endl;
+ throw;
+ }
+ log() << "compact " << ns << " end" << endl;
+ }
+ return ok;
+ }
+
+ bool isCurrentlyAReplSetPrimary();
+
+ class CompactCmd : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool maintenanceMode() const { return true; }
+ virtual bool logTheOp() { return false; }
+ virtual void help( stringstream& help ) const {
+            help << "compact collection\n"
+                "warning: this operation blocks the server and is slow. you can cancel with cancelOp()\n"
+                "{ compact : <collection_name>, [force:true], [validate:true] }\n"
+                "  force - allows the command to run on a replica set primary\n"
+                "  validate - check that records are not corrupt before adding them to the newly compacted extents. slower but safer (default is true in this version)\n";
+ }
+ virtual bool requiresAuth() { return true; }
+ CompactCmd() : Command("compact") { }
+
+ virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string coll = cmdObj.firstElement().valuestr();
+ if( coll.empty() || db.empty() ) {
+ errmsg = "no collection name specified";
+ return false;
+ }
+
+ if( isCurrentlyAReplSetPrimary() && !cmdObj["force"].trueValue() ) {
+ errmsg = "will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force";
+ return false;
+ }
+
+ string ns = db + '.' + coll;
+ if ( ! NamespaceString::normal(ns.c_str()) ) {
+ errmsg = "bad namespace name";
+ return false;
+ }
+
+ // parameter validation to avoid triggering assertions in compact()
+ if ( str::contains(ns, ".system.") ) {
+ errmsg = "can't compact a system namespace";
+ return false;
+ }
+
+ {
+ writelock lk;
+ Client::Context ctx(ns);
+ NamespaceDetails *d = nsdetails(ns.c_str());
+ if( ! d ) {
+ errmsg = "namespace does not exist";
+ return false;
+ }
+
+ if ( d->capped ) {
+ errmsg = "cannot compact a capped collection";
+ return false;
+ }
+ }
+
+ double pf = 1.0;
+ int pb = 0;
+ if( cmdObj.hasElement("paddingFactor") ) {
+ pf = cmdObj["paddingFactor"].Number();
+ assert( pf >= 1.0 && pf <= 4.0 );
+ }
+ if( cmdObj.hasElement("paddingBytes") ) {
+ pb = (int) cmdObj["paddingBytes"].Number();
+ assert( pb >= 0 && pb <= 1024 * 1024 );
+ }
+
+ bool validate = !cmdObj.hasElement("validate") || cmdObj["validate"].trueValue(); // default is true at the moment
+ bool ok = compact(ns, errmsg, validate, result, pf, pb);
+ return ok;
+ }
+ };
+ static CompactCmd compactCmd;
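+
+    /* illustrative shell invocation (hypothetical collection name):
+         db.runCommand({ compact: "events", paddingFactor: 1.1, validate: true, force: true })
+       per the checks above, paddingFactor must lie in [1.0, 4.0] and
+       paddingBytes in [0, 1MB]; force is required on a replica set primary. */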
+
+}
diff --git a/src/mongo/db/compact.h b/src/mongo/db/compact.h
new file mode 100644
index 00000000000..7bf49c8e1b8
--- /dev/null
+++ b/src/mongo/db/compact.h
@@ -0,0 +1,50 @@
+// compact.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /** for bottom up fastbuildindex (where we presort keys) */
+ struct SortPhaseOne {
+ SortPhaseOne() {
+ n = 0;
+ nkeys = 0;
+ multi = false;
+ }
+ shared_ptr<BSONObjExternalSorter> sorter;
+ unsigned long long n; // # of records
+ unsigned long long nkeys;
+ bool multi; // multikey index
+
+ void addKeys(const IndexSpec& spec, const BSONObj& o, DiskLoc loc) {
+ BSONObjSet keys;
+ spec.getKeys(o, keys);
+ int k = 0;
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ if( ++k == 2 ) {
+ multi = true;
+ }
+ sorter->add(*i, loc);
+ nkeys++;
+ }
+ n++;
+ }
+ };
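+
+    // Illustrative sketch of how a presort pass might drive SortPhaseOne, assuming a
+    // sorter has been attached and (obj, loc) pairs come from a full collection scan:
+    //     SortPhaseOne phase1;
+    //     phase1.sorter.reset( ... );   // a BSONObjExternalSorter ordered on the index key
+    //     for each record: phase1.addKeys( idxSpec, obj, diskLoc );
+    // afterwards phase1.sorter holds every key in sorted order for a bottom-up btree build.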
+
+}
diff --git a/src/mongo/db/concurrency.h b/src/mongo/db/concurrency.h
new file mode 100644
index 00000000000..33bc0caac77
--- /dev/null
+++ b/src/mongo/db/concurrency.h
@@ -0,0 +1,21 @@
+// @file concurrency.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mongomutex.h"
diff --git a/src/mongo/db/curop-inl.h b/src/mongo/db/curop-inl.h
new file mode 100644
index 00000000000..7dd678b185d
--- /dev/null
+++ b/src/mongo/db/curop-inl.h
@@ -0,0 +1 @@
+#include "curop.h"
diff --git a/src/mongo/db/curop.cpp b/src/mongo/db/curop.cpp
new file mode 100644
index 00000000000..3cc452b46cc
--- /dev/null
+++ b/src/mongo/db/curop.cpp
@@ -0,0 +1,173 @@
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "curop.h"
+#include "database.h"
+
+namespace mongo {
+
+ // todo : move more here
+
+ CurOp::CurOp( Client * client , CurOp * wrapped ) :
+ _client(client),
+ _wrapped(wrapped)
+ {
+ if ( _wrapped )
+ _client->_curOp = this;
+ _start = _checkpoint = 0;
+ _active = false;
+ _reset();
+ _op = 0;
+ // These addresses should never be written to again. The zeroes are
+ // placed here as a precaution because currentOp may be accessed
+ // without the db mutex.
+ memset(_ns, 0, sizeof(_ns));
+ }
+
+ void CurOp::_reset() {
+ _command = false;
+ _lockType = 0;
+ _dbprofile = 0;
+ _end = 0;
+ _waitingForLock = false;
+ _message = "";
+ _progressMeter.finished();
+ _killed = false;
+ _numYields = 0;
+ }
+
+ void CurOp::reset() {
+ _reset();
+ _start = _checkpoint = 0;
+ _opNum = _nextOpNum++;
+ _ns[0] = 0;
+ _debug.reset();
+ _query.reset();
+ _active = true; // this should be last for ui clarity
+ }
+
+ void CurOp::reset( const HostAndPort& remote, int op ) {
+ reset();
+ if( _remote != remote ) {
+ // todo : _remote is not thread safe yet is used as such!
+ _remote = remote;
+ }
+ _op = op;
+ }
+
+ ProgressMeter& CurOp::setMessage( const char * msg , unsigned long long progressMeterTotal , int secondsBetween ) {
+ if ( progressMeterTotal ) {
+ if ( _progressMeter.isActive() ) {
+ cout << "about to assert, old _message: " << _message << " new message:" << msg << endl;
+ assert( ! _progressMeter.isActive() );
+ }
+ _progressMeter.reset( progressMeterTotal , secondsBetween );
+ }
+ else {
+ _progressMeter.finished();
+ }
+ _message = msg;
+ return _progressMeter;
+ }
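+
+    // Illustrative usage (sketch): a long-running operation reports progress through its
+    // CurOp roughly like this, and the message/meter then show up in currentOp output:
+    //     ProgressMeter& pm = cc().curop()->setMessage( "some long operation" , totalUnits );
+    //     // ... call pm.hit() once per unit of work ...
+    //     cc().curop()->setMessage( "" );   // a zero total finishes the meter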
+
+
+ BSONObj CurOp::info() {
+ if( ! cc().getAuthenticationInfo()->isAuthorized("admin") ) {
+ BSONObjBuilder b;
+ b.append("err", "unauthorized");
+ return b.obj();
+ }
+ return infoNoauth();
+ }
+
+ CurOp::~CurOp() {
+ if ( _wrapped ) {
+ scoped_lock bl(Client::clientsMutex);
+ _client->_curOp = _wrapped;
+ }
+ _client = 0;
+ }
+
+ void CurOp::enter( Client::Context * context ) {
+ ensureStarted();
+ setNS( context->ns() );
+ _dbprofile = context->_db ? context->_db->profile : 0;
+ }
+
+ void CurOp::leave( Client::Context * context ) {
+ unsigned long long now = curTimeMicros64();
+ Top::global.record( _ns , _op , _lockType , now - _checkpoint , _command );
+ _checkpoint = now;
+ }
+
+ BSONObj CurOp::infoNoauth() {
+ BSONObjBuilder b;
+ b.append("opid", _opNum);
+ bool a = _active && _start;
+ b.append("active", a);
+ if ( _lockType )
+ b.append("lockType" , _lockType > 0 ? "write" : "read" );
+ b.append("waitingForLock" , _waitingForLock );
+
+ if( a ) {
+ b.append("secs_running", elapsedSeconds() );
+ }
+
+ b.append( "op" , opToString( _op ) );
+
+ b.append("ns", _ns);
+
+ _query.append( b , "query" );
+
+ if( !_remote.empty() ) {
+ b.append("client", _remote.toString());
+ }
+
+ if ( _client ) {
+ b.append( "desc" , _client->desc() );
+ if ( _client->_threadId.size() )
+ b.append( "threadId" , _client->_threadId );
+ if ( _client->_connectionId )
+ b.appendNumber( "connectionId" , _client->_connectionId );
+ }
+
+ if ( ! _message.empty() ) {
+ if ( _progressMeter.isActive() ) {
+ StringBuilder buf(128);
+ buf << _message.toString() << " " << _progressMeter.toString();
+ b.append( "msg" , buf.str() );
+ BSONObjBuilder sub( b.subobjStart( "progress" ) );
+ sub.appendNumber( "done" , (long long)_progressMeter.done() );
+ sub.appendNumber( "total" , (long long)_progressMeter.total() );
+ sub.done();
+ }
+ else {
+ b.append( "msg" , _message.toString() );
+ }
+ }
+
+ if( killed() )
+ b.append("killed", true);
+
+ b.append( "numYields" , _numYields );
+
+ return b.obj();
+ }
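+
+    // For illustration, a typical infoNoauth() result (values invented) looks like:
+    //   { opid: 12345, active: true, lockType: "write", waitingForLock: false,
+    //     secs_running: 3, op: "query", ns: "test.foo", query: { a: 1 },
+    //     client: "127.0.0.1:50211", desc: "conn", numYields: 0 }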
+
+ AtomicUInt CurOp::_nextOpNum;
+
+}
diff --git a/src/mongo/db/curop.h b/src/mongo/db/curop.h
new file mode 100644
index 00000000000..192404d8796
--- /dev/null
+++ b/src/mongo/db/curop.h
@@ -0,0 +1,313 @@
+// @file curop.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include "namespace-inl.h"
+#include "client.h"
+#include "../bson/util/atomic_int.h"
+#include "../util/concurrency/spin_lock.h"
+#include "../util/time_support.h"
+#include "../util/net/hostandport.h"
+
+namespace mongo {
+
+ class CurOp;
+
+    /* lifespan is different from CurOp's because of recursion with DBDirectClient */
+ class OpDebug {
+ public:
+ OpDebug() : ns(""){ reset(); }
+
+ void reset();
+
+ string toString() const;
+ void append( const CurOp& curop, BSONObjBuilder& b ) const;
+
+ // -------------------
+
+ StringBuilder extra; // weird things we need to fix later
+
+ // basic options
+ int op;
+ bool iscommand;
+ Namespace ns;
+ BSONObj query;
+ BSONObj updateobj;
+
+ // detailed options
+ long long cursorid;
+ int ntoreturn;
+ int ntoskip;
+ bool exhaust;
+
+ // debugging/profile info
+ int nscanned;
+ bool idhack; // indicates short circuited code path on an update to make the update faster
+ bool scanAndOrder; // scanandorder query plan aspect was used
+ bool moved; // update resulted in a move (moves are expensive)
+ bool fastmod;
+ bool fastmodinsert; // upsert of an $operation. builds a default object
+ bool upsert; // true if the update actually did an insert
+ int keyUpdates;
+
+ // error handling
+ ExceptionInfo exceptionInfo;
+
+ // response info
+ int executionTime;
+ int nreturned;
+ int responseLength;
+ };
+
+ /**
+ * stores a copy of a bson obj in a fixed size buffer
+     * if it's too big for the buffer, says "too big"
+ * useful for keeping a copy around indefinitely without wasting a lot of space or doing malloc
+ */
+ class CachedBSONObj {
+ public:
+        enum { TOO_BIG_SENTINEL = 1 };
+ static BSONObj _tooBig; // { $msg : "query not recording (too large)" }
+
+ CachedBSONObj() {
+ _size = (int*)_buf;
+ reset();
+ }
+
+ void reset( int sz = 0 ) {
+ _lock.lock();
+ _reset( sz );
+ _lock.unlock();
+ }
+
+ void set( const BSONObj& o ) {
+ scoped_spinlock lk(_lock);
+ int sz = o.objsize();
+ if ( sz > (int) sizeof(_buf) ) {
+ _reset(TOO_BIG_SENTINEL);
+ }
+ else {
+ memcpy(_buf, o.objdata(), sz );
+ }
+ }
+
+ int size() const { return *_size; }
+ bool have() const { return size() > 0; }
+
+ BSONObj get() const {
+ scoped_spinlock lk(_lock);
+ return _get();
+ }
+
+ void append( BSONObjBuilder& b , const StringData& name ) const {
+ scoped_spinlock lk(_lock);
+ BSONObj temp = _get();
+ b.append( name , temp );
+ }
+
+ private:
+ /** you have to be locked when you call this */
+ BSONObj _get() const {
+ int sz = size();
+ if ( sz == 0 )
+ return BSONObj();
+ if ( sz == TOO_BIG_SENTINEL )
+ return _tooBig;
+ return BSONObj( _buf ).copy();
+ }
+
+ /** you have to be locked when you call this */
+ void _reset( int sz ) { _size[0] = sz; }
+
+ mutable SpinLock _lock;
+ int * _size;
+ char _buf[512];
+ };
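+
+    // Illustrative usage (sketch):
+    //     CachedBSONObj q;
+    //     q.set( someQuery );          // copies into the fixed 512 byte buffer
+    //     if( q.have() )
+    //         BSONObj snap = q.get();  // safe snapshot, even from another thread
+    // objects larger than the buffer read back as the { $msg : ... } "too big" object.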
+
+ /* Current operation (for the current Client).
+ an embedded member of Client class, and typically used from within the mutex there.
+ */
+ class CurOp : boost::noncopyable {
+ public:
+ CurOp( Client * client , CurOp * wrapped = 0 );
+ ~CurOp();
+
+ bool haveQuery() const { return _query.have(); }
+ BSONObj query() { return _query.get(); }
+ void appendQuery( BSONObjBuilder& b , const StringData& name ) const { _query.append( b , name ); }
+
+ void ensureStarted() {
+ if ( _start == 0 )
+ _start = _checkpoint = curTimeMicros64();
+ }
+ bool isStarted() const { return _start > 0; }
+ void enter( Client::Context * context );
+ void leave( Client::Context * context );
+ void reset();
+ void reset( const HostAndPort& remote, int op );
+ void markCommand() { _command = true; }
+
+ void waitingForLock( int type ) {
+ _waitingForLock = true;
+ if ( type > 0 )
+ _lockType = 1;
+ else
+ _lockType = -1;
+ }
+ void gotLock() { _waitingForLock = false; }
+ OpDebug& debug() { return _debug; }
+ int profileLevel() const { return _dbprofile; }
+ const char * getNS() const { return _ns; }
+
+ bool shouldDBProfile( int ms ) const {
+ if ( _dbprofile <= 0 )
+ return false;
+
+ return _dbprofile >= 2 || ms >= cmdLine.slowMS;
+ }
+
+ AtomicUInt opNum() const { return _opNum; }
+
+ /** if this op is running */
+ bool active() const { return _active; }
+
+ int getLockType() const { return _lockType; }
+ bool isWaitingForLock() const { return _waitingForLock; }
+ int getOp() const { return _op; }
+ unsigned long long startTime() { // micros
+ ensureStarted();
+ return _start;
+ }
+ void done() {
+ _active = false;
+ _end = curTimeMicros64();
+ }
+ unsigned long long totalTimeMicros() {
+ massert( 12601 , "CurOp not marked done yet" , ! _active );
+ return _end - startTime();
+ }
+ int totalTimeMillis() { return (int) (totalTimeMicros() / 1000); }
+ int elapsedMillis() {
+ unsigned long long total = curTimeMicros64() - startTime();
+ return (int) (total / 1000);
+ }
+ int elapsedSeconds() { return elapsedMillis() / 1000; }
+ void setQuery(const BSONObj& query) { _query.set( query ); }
+ Client * getClient() const { return _client; }
+ BSONObj info();
+ BSONObj infoNoauth();
+ string getRemoteString( bool includePort = true ) { return _remote.toString(includePort); }
+ ProgressMeter& setMessage( const char * msg , unsigned long long progressMeterTotal = 0 , int secondsBetween = 3 );
+ string getMessage() const { return _message.toString(); }
+ ProgressMeter& getProgressMeter() { return _progressMeter; }
+ CurOp *parent() const { return _wrapped; }
+ void kill() { _killed = true; }
+ bool killed() const { return _killed; }
+ void yielded() { _numYields++; }
+ void setNS(const char *ns) {
+ strncpy(_ns, ns, Namespace::MaxNsLen);
+ _ns[Namespace::MaxNsLen] = 0;
+ }
+
+ private:
+ friend class Client;
+ void _reset();
+
+ static AtomicUInt _nextOpNum;
+ Client * _client;
+ CurOp * _wrapped;
+ unsigned long long _start;
+ unsigned long long _checkpoint;
+ unsigned long long _end;
+ bool _active;
+ int _op;
+ bool _command;
+ int _lockType; // see concurrency.h for values
+ bool _waitingForLock;
+ int _dbprofile; // 0=off, 1=slow, 2=all
+        AtomicUInt _opNum;                  // todo: simply being "unsigned" may make more sense here
+ char _ns[Namespace::MaxNsLen+2];
+ HostAndPort _remote; // CAREFUL here with thread safety
+ CachedBSONObj _query; // CachedBSONObj is thread safe
+ OpDebug _debug;
+ ThreadSafeString _message;
+ ProgressMeter _progressMeter;
+ volatile bool _killed;
+ int _numYields;
+ };
+
+ /* _globalKill: we are shutting down
+ otherwise kill attribute set on specified CurOp
+ this class does not handle races between interruptJs and the checkForInterrupt functions - those must be
+ handled by the client of this class
+ */
+ extern class KillCurrentOp {
+ public:
+ void killAll();
+ void kill(AtomicUInt i);
+
+ /** @return true if global interrupt and should terminate the operation */
+ bool globalInterruptCheck() const { return _globalKill; }
+
+ void checkForInterrupt( bool heedMutex = true ) {
+ Client& c = cc();
+ if ( heedMutex && d.dbMutex.isWriteLocked() )
+ return;
+ if( _globalKill )
+ uasserted(11600,"interrupted at shutdown");
+ if( c.curop()->killed() )
+ uasserted(11601,"interrupted");
+ if( c.sometimes(1024) ) {
+ AbstractMessagingPort *p = cc().port();
+ if( p )
+ p->assertStillConnected();
+ }
+ }
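+
+        // Illustrative call site (sketch): long-running loops call this once per
+        // iteration so that killOp and shutdown can stop them, e.g.
+        //     while( c->ok() ) {
+        //         killCurrentOp.checkForInterrupt();   // throws if killed
+        //         ...
+        //         c->advance();
+        //     }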
+
+ /** @return "" if not interrupted. otherwise, you should stop. */
+ const char *checkForInterruptNoAssert( /*bool heedMutex = true*/ ) {
+ Client& c = cc();
+            // always called with false so the check is commented out:
+ /*if ( heedMutex && d.dbMutex.isWriteLocked() )
+ return "";*/
+ if( _globalKill )
+ return "interrupted at shutdown";
+ if( c.curop()->killed() )
+ return "interrupted";
+ if( c.sometimes(1024) ) {
+ try {
+ AbstractMessagingPort *p = cc().port();
+ if( p )
+ p->assertStillConnected();
+ }
+ catch(...) {
+ log() << "no longer connected to client";
+ return "no longer connected to client";
+ }
+ }
+ return "";
+ }
+
+ private:
+ void interruptJs( AtomicUInt *op );
+ volatile bool _globalKill;
+ } killCurrentOp;
+
+}
diff --git a/src/mongo/db/cursor.cpp b/src/mongo/db/cursor.cpp
new file mode 100644
index 00000000000..ac7afc1532b
--- /dev/null
+++ b/src/mongo/db/cursor.cpp
@@ -0,0 +1,166 @@
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "pdfile.h"
+#include "curop-inl.h"
+
+namespace mongo {
+
+ bool BasicCursor::advance() {
+ killCurrentOp.checkForInterrupt();
+ if ( eof() ) {
+ if ( tailable_ && !last.isNull() ) {
+ curr = s->next( last );
+ }
+ else {
+ return false;
+ }
+ }
+ else {
+ last = curr;
+ curr = s->next( curr );
+ }
+ incNscanned();
+ return ok();
+ }
+
+ /* these will be used outside of mutexes - really functors - thus the const */
+ class Forward : public AdvanceStrategy {
+ virtual DiskLoc next( const DiskLoc &prev ) const {
+ return prev.rec()->getNext( prev );
+ }
+ } _forward;
+
+ class Reverse : public AdvanceStrategy {
+ virtual DiskLoc next( const DiskLoc &prev ) const {
+ return prev.rec()->getPrev( prev );
+ }
+ } _reverse;
+
+ const AdvanceStrategy *forward() {
+ return &_forward;
+ }
+ const AdvanceStrategy *reverse() {
+ return &_reverse;
+ }
+
+ DiskLoc nextLoop( NamespaceDetails *nsd, const DiskLoc &prev ) {
+ assert( nsd->capLooped() );
+ DiskLoc next = forward()->next( prev );
+ if ( !next.isNull() )
+ return next;
+ return nsd->firstRecord();
+ }
+
+ DiskLoc prevLoop( NamespaceDetails *nsd, const DiskLoc &curr ) {
+ assert( nsd->capLooped() );
+ DiskLoc prev = reverse()->next( curr );
+ if ( !prev.isNull() )
+ return prev;
+ return nsd->lastRecord();
+ }
+
+ ForwardCappedCursor::ForwardCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) :
+ nsd( _nsd ) {
+ if ( !nsd )
+ return;
+ DiskLoc start = startLoc;
+ if ( start.isNull() ) {
+ if ( !nsd->capLooped() )
+ start = nsd->firstRecord();
+ else {
+ start = nsd->capExtent.ext()->firstRecord;
+ if ( !start.isNull() && start == nsd->capFirstNewRecord ) {
+ start = nsd->capExtent.ext()->lastRecord;
+ start = nextLoop( nsd, start );
+ }
+ }
+ }
+ curr = start;
+ s = this;
+ incNscanned();
+ }
+
+ DiskLoc ForwardCappedCursor::next( const DiskLoc &prev ) const {
+ assert( nsd );
+ if ( !nsd->capLooped() )
+ return forward()->next( prev );
+
+ DiskLoc i = prev;
+ // Last record
+ if ( i == nsd->capExtent.ext()->lastRecord )
+ return DiskLoc();
+ i = nextLoop( nsd, i );
+ // If we become capFirstNewRecord from same extent, advance to next extent.
+ if ( i == nsd->capFirstNewRecord &&
+ i != nsd->capExtent.ext()->firstRecord )
+ i = nextLoop( nsd, nsd->capExtent.ext()->lastRecord );
+ // If we have just gotten to beginning of capExtent, skip to capFirstNewRecord
+ if ( i == nsd->capExtent.ext()->firstRecord )
+ i = nsd->capFirstNewRecord;
+ return i;
+ }
+
+ ReverseCappedCursor::ReverseCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) :
+ nsd( _nsd ) {
+ if ( !nsd )
+ return;
+ DiskLoc start = startLoc;
+ if ( start.isNull() ) {
+ if ( !nsd->capLooped() ) {
+ start = nsd->lastRecord();
+ }
+ else {
+ start = nsd->capExtent.ext()->lastRecord;
+ }
+ }
+ curr = start;
+ s = this;
+ incNscanned();
+ }
+
+ DiskLoc ReverseCappedCursor::next( const DiskLoc &prev ) const {
+ assert( nsd );
+ if ( !nsd->capLooped() )
+ return reverse()->next( prev );
+
+ DiskLoc i = prev;
+ // Last record
+ if ( nsd->capFirstNewRecord == nsd->capExtent.ext()->firstRecord ) {
+ if ( i == nextLoop( nsd, nsd->capExtent.ext()->lastRecord ) ) {
+ return DiskLoc();
+ }
+ }
+ else {
+ if ( i == nsd->capExtent.ext()->firstRecord ) {
+ return DiskLoc();
+ }
+ }
+ // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev.
+ if ( i == nsd->capFirstNewRecord )
+ i = prevLoop( nsd, nsd->capExtent.ext()->firstRecord );
+ else
+ i = prevLoop( nsd, i );
+ // If we just became last in cap extent, advance past capFirstNewRecord
+ // (We know capExtent.ext()->firstRecord != capFirstNewRecord, since would
+ // have returned DiskLoc() earlier otherwise.)
+ if ( i == nsd->capExtent.ext()->lastRecord )
+ i = reverse()->next( nsd->capFirstNewRecord );
+
+ return i;
+ }
+} // namespace mongo
diff --git a/src/mongo/db/cursor.h b/src/mongo/db/cursor.h
new file mode 100644
index 00000000000..8e9e922733d
--- /dev/null
+++ b/src/mongo/db/cursor.h
@@ -0,0 +1,246 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+
+#include "jsobj.h"
+#include "diskloc.h"
+#include "matcher.h"
+
+namespace mongo {
+
+ class NamespaceDetails;
+ class Record;
+ class CoveredIndexMatcher;
+
+ /* Query cursors, base class. This is for our internal cursors. "ClientCursor" is a separate
+ concept and is for the user's cursor.
+
+ WARNING concurrency: the vfunctions below are called back from within a
+ ClientCursor::ccmutex. Don't cause a deadlock, you've been warned.
+ */
+ class Cursor : boost::noncopyable {
+ public:
+ virtual ~Cursor() {}
+ virtual bool ok() = 0;
+ bool eof() { return !ok(); }
+ virtual Record* _current() = 0;
+ virtual BSONObj current() = 0;
+ virtual DiskLoc currLoc() = 0;
+ virtual bool advance() = 0; /*true=ok*/
+ virtual BSONObj currKey() const { return BSONObj(); }
+
+        // DiskLoc the cursor requires for continued operation.  Before this
+        // DiskLoc is deleted, the cursor must be advanced or destroyed.
+ virtual DiskLoc refLoc() = 0;
+
+ /* Implement these if you want the cursor to be "tailable" */
+
+ /* Request that the cursor starts tailing after advancing past last record. */
+ /* The implementation may or may not honor this request. */
+ virtual void setTailable() {}
+ /* indicates if tailing is enabled. */
+ virtual bool tailable() {
+ return false;
+ }
+
+ virtual void aboutToDeleteBucket(const DiskLoc& b) { }
+
+ /* optional to implement. if implemented, means 'this' is a prototype */
+ virtual Cursor* clone() {
+ return 0;
+ }
+
+ virtual BSONObj indexKeyPattern() {
+ return BSONObj();
+ }
+
+ virtual bool supportGetMore() = 0;
+
+ /* called after every query block is iterated -- i.e. between getMore() blocks
+ so you can note where we are, if necessary.
+ */
+ virtual void noteLocation() { }
+
+ /* called before query getmore block is iterated */
+ virtual void checkLocation() { }
+
+ /**
+ * Called before a document pointed at by an earlier iterate of this cursor is to be
+ * modified. It is ok if the current iterate also points to the document to be modified.
+ */
+ virtual void prepareToTouchEarlierIterate() { noteLocation(); }
+
+ /** Recover from a previous call to prepareToTouchEarlierIterate(). */
+ virtual void recoverFromTouchingEarlierIterate() { checkLocation(); }
+
+ virtual bool supportYields() = 0;
+
+ /** Called before a ClientCursor yield. */
+ virtual bool prepareToYield() { noteLocation(); return supportYields(); }
+
+ /** Called after a ClientCursor yield. Recovers from a previous call to prepareToYield(). */
+ virtual void recoverFromYield() { checkLocation(); }
+
+ virtual string toString() { return "abstract?"; }
+
+ /* used for multikey index traversal to avoid sending back dups. see Matcher::matches().
+ if a multikey index traversal:
+ if loc has already been sent, returns true.
+ otherwise, marks loc as sent.
+ */
+ virtual bool getsetdup(DiskLoc loc) = 0;
+
+ virtual bool isMultiKey() const = 0;
+
+ virtual bool autoDedup() const { return true; }
+
+ /**
+ * return true if the keys in the index have been modified from the main doc
+ * if you have { a : 1 , b : [ 1 , 2 ] }
+ * an index on { a : 1 } would not be modified
+         * an index on { b : 1 } would be, since the values of the array are put in the index,
+         * not the array itself
+ */
+ virtual bool modifiedKeys() const = 0;
+
+ virtual BSONObj prettyIndexBounds() const { return BSONArray(); }
+
+ virtual bool capped() const { return false; }
+
+ virtual long long nscanned() = 0;
+
+ // The implementation may return different matchers depending on the
+ // position of the cursor. If matcher() is nonzero at the start,
+ // matcher() should be checked each time advance() is called.
+ // Implementations which generate their own matcher should return this
+ // to avoid a matcher being set manually.
+ // Note that the return values differ subtly here
+
+ // Used when we want fast matcher lookup
+ virtual CoveredIndexMatcher *matcher() const { return 0; }
+ // Used when we need to share this matcher with someone else
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return shared_ptr< CoveredIndexMatcher >(); }
+
+ virtual bool currentMatches( MatchDetails *details = 0 ) {
+ return !matcher() || matcher()->matchesCurrent( this, details );
+ }
+
+ // A convenience function for setting the value of matcher() manually
+        // so it may be accessed later.  Implementations which must generate
+ // their own matcher() should assert here.
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) {
+ massert( 13285, "manual matcher config not allowed", false );
+ }
+
+ virtual void explainDetails( BSONObjBuilder& b ) { return; }
+ };
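+
+    // Illustrative iteration protocol (sketch): callers typically drive a Cursor like
+    // this, consulting the matcher and the dup set at each position:
+    //     while( c->ok() ) {
+    //         if( c->currentMatches() && !c->getsetdup( c->currLoc() ) ) {
+    //             BSONObj obj = c->current();
+    //             // ... use obj ...
+    //         }
+    //         c->advance();
+    //     }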
+
+ // strategy object implementing direction of traversal.
+ class AdvanceStrategy {
+ public:
+ virtual ~AdvanceStrategy() { }
+ virtual DiskLoc next( const DiskLoc &prev ) const = 0;
+ };
+
+ const AdvanceStrategy *forward();
+ const AdvanceStrategy *reverse();
+
+ /* table-scan style cursor */
+ class BasicCursor : public Cursor {
+ public:
+ BasicCursor(DiskLoc dl, const AdvanceStrategy *_s = forward()) : curr(dl), s( _s ), _nscanned() {
+ incNscanned();
+ init();
+ }
+ BasicCursor(const AdvanceStrategy *_s = forward()) : s( _s ), _nscanned() {
+ init();
+ }
+ bool ok() { return !curr.isNull(); }
+ Record* _current() {
+ assert( ok() );
+ return curr.rec();
+ }
+ BSONObj current() {
+ Record *r = _current();
+ BSONObj j(r);
+ return j;
+ }
+ virtual DiskLoc currLoc() { return curr; }
+ virtual DiskLoc refLoc() { return curr.isNull() ? last : curr; }
+ bool advance();
+ virtual string toString() { return "BasicCursor"; }
+ virtual void setTailable() {
+ if ( !curr.isNull() || !last.isNull() )
+ tailable_ = true;
+ }
+ virtual bool tailable() { return tailable_; }
+ virtual bool getsetdup(DiskLoc loc) { return false; }
+ virtual bool isMultiKey() const { return false; }
+ virtual bool modifiedKeys() const { return false; }
+ virtual bool supportGetMore() { return true; }
+ virtual bool supportYields() { return true; }
+ virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
+ virtual long long nscanned() { return _nscanned; }
+
+ protected:
+ DiskLoc curr, last;
+ const AdvanceStrategy *s;
+ void incNscanned() { if ( !curr.isNull() ) { ++_nscanned; } }
+ private:
+ bool tailable_;
+ shared_ptr< CoveredIndexMatcher > _matcher;
+ long long _nscanned;
+ void init() { tailable_ = false; }
+ };
+
+ /* used for order { $natural: -1 } */
+ class ReverseCursor : public BasicCursor {
+ public:
+ ReverseCursor(DiskLoc dl) : BasicCursor( dl, reverse() ) { }
+ ReverseCursor() : BasicCursor( reverse() ) { }
+ virtual string toString() { return "ReverseCursor"; }
+ };
+
+ class ForwardCappedCursor : public BasicCursor, public AdvanceStrategy {
+ public:
+ ForwardCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() );
+ virtual string toString() {
+ return "ForwardCappedCursor";
+ }
+ virtual DiskLoc next( const DiskLoc &prev ) const;
+ virtual bool capped() const { return true; }
+ private:
+ NamespaceDetails *nsd;
+ };
+
+ class ReverseCappedCursor : public BasicCursor, public AdvanceStrategy {
+ public:
+ ReverseCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() );
+ virtual string toString() {
+ return "ReverseCappedCursor";
+ }
+ virtual DiskLoc next( const DiskLoc &prev ) const;
+ virtual bool capped() const { return true; }
+ private:
+ NamespaceDetails *nsd;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/d_concurrency.cpp b/src/mongo/db/d_concurrency.cpp
new file mode 100755
index 00000000000..e3ad974cbfc
--- /dev/null
+++ b/src/mongo/db/d_concurrency.cpp
@@ -0,0 +1,231 @@
+// @file d_concurrency.cpp
+
+#include "pch.h"
+#include "d_concurrency.h"
+#include "../util/concurrency/threadlocal.h"
+#include "../util/concurrency/rwlock.h"
+#include "../util/concurrency/value.h"
+#include "../util/assert_util.h"
+#include "client.h"
+#include "namespacestring.h"
+#include "d_globals.h"
+
+// oplog locking
+// no top level read locks
+// system.profile writing
+// oplog now
+// yielding
+// commitIfNeeded
+
+namespace mongo {
+
+ using namespace clcimpl;
+
+ Client::LockStatus::LockStatus() {
+ excluder=global=collection=0;
+ }
+
+ namespace clcimpl {
+ Shared::Shared(unsigned& _state, RWLock& lock) : state(_state) {
+ rw = 0;
+ if( state ) {
+ // already locked
+ dassert( (state & (AcquireShared|AcquireExclusive)) == 0 );
+ return;
+ }
+ rw = &lock;
+ state = AcquireShared;
+ rw->lock_shared();
+ state = LockedShared;
+ }
+ Shared::~Shared() {
+ if( rw ) {
+ state = Unlocked;
+ rw->unlock_shared();
+ }
+ }
+ Exclusive::Exclusive(unsigned& _state, RWLock& lock) : state(_state) {
+ rw = 0;
+ if( state ) {
+ // already locked
+ dassert( (state & (AcquireShared|AcquireExclusive)) == 0 );
+ assert( state == LockedExclusive ); // can't be in shared state
+ return;
+ }
+ rw = &lock;
+ state = AcquireExclusive;
+ rw->lock();
+ state = LockedExclusive;
+ }
+ Exclusive::~Exclusive() {
+ if( rw ) {
+ state = Unlocked;
+ rw->unlock();
+ }
+ }
+ } // clcimpl namespace
+
+ // this tie-in temporary until MongoMutex is folded in more directly.
+ // called when the lock has been achieved
+ void MongoMutex::lockedExclusively() {
+ Client& c = cc();
+        curopGotLock(&c); // hopefully lockStatus replaces this one day
+ c.lockStatus.global = clcimpl::LockedExclusive;
+ _minfo.entered(); // hopefully eliminate one day
+ }
+
+ void MongoMutex::unlockingExclusively() {
+ Client& c = cc();
+ _minfo.leaving();
+ c.lockStatus.global = Unlocked;
+ }
+
+ MongoMutex::MongoMutex(const char *name) : _m(name) {
+ static int n = 0;
+        assert( ++n == 1 ); // releasingWriteLock below assumes MongoMutex is a singleton and uses the dbMutex ref above
+ _remapPrivateViewRequested = false;
+ }
+
+ bool subcollectionOf(const string& parent, const char *child) {
+ if( parent == child )
+ return true;
+ if( !str::startsWith(child, parent) )
+ return false;
+ const char *p = child + parent.size();
+ uassert(15963, str::stream() << "bad collection name: " << child, !str::endsWith(p, '.'));
+ return *p == '.' && p[1] == '$';
+ }
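+
+    // examples (illustrative): subcollectionOf("foo.bar", "foo.bar") and
+    // subcollectionOf("foo.bar", "foo.bar.$idx") are true (the latter covers index
+    // namespaces), while subcollectionOf("foo.bar", "foo.barbaz") is false.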
+
+    // (maybe tbd) ...
+    // for simplicity we will use the global write lock for writes to system.* collections
+    // for now; this is also somewhat faster, as we don't need to take an extra latch for them;
+    // otherwise there are cases that would need careful handling, such as namespacedetails
+    // methods reaching into system.indexes implicitly
+    // exception : system.profile
+ static bool lkspecial(const string& ns) {
+ NamespaceString s(ns);
+ return s.isSystem() && s.coll != "system.profile";
+ }
+
+    /** Notes on d.writeExcluder
+        we want to be able to block any attempted write while allowing reads; additionally
+        we force non-greedy acquisition, disallowing greediness of write lock acquisitions,
+        so that reads can continue. d.writeExcluder exists for that purpose. The #1 user is
+        groupCommitWithLimitedLocks(), but it is useful elsewhere, such as for lock and fsync.
+    */
+
+ ExcludeAllWrites::ExcludeAllWrites() :
+ lk(cc().lockStatus.excluder, d.writeExcluder),
+ gslk()
+ {
+ LOG(3) << "ExcludeAllWrites" << endl;
+ wassert( !d.dbMutex.isWriteLocked() );
+    }
+ ExcludeAllWrites::~ExcludeAllWrites() {
+ }
+
+ // CLC turns on the "collection level concurrency" code
+ // (which is under development and not finished)
+#if defined(CLC)
+ // called after a context is set. check that the correct collection is locked
+ void Client::checkLocks() const {
+ DEV {
+ if( !d.dbMutex.isWriteLocked() ) {
+ const char *n = ns();
+ if( lockStatus.whichCollection.empty() ) {
+ log() << "DEBUG checkLocks error expected to already be locked: " << n << endl;
+ dassert(false);
+ }
+ dassert( subcollectionOf(lockStatus.whichCollection, n) || lkspecial(n) );
+ }
+ }
+ }
+#endif
+
+ // we don't keep these locks in the namespacedetailstransient and Database
+ // objects -- that makes things safer as we need not prove to ourselves that they
+ // are always in scope when we need them.
+ // todo: we don't clean these locks up yet.
+ // todo: avoiding the mutex here might be nice.
+ class LockObjectForEachCollection {
+ //mapsf<string,RWLock*> dblocks;
+ mapsf<string,RWLock*> nslocks;
+ public:
+ /*RWLock& fordb(string db) {
+ mapsf<string,RWLock*>::ref r(dblocks);
+ RWLock*& rw = r[db];
+ if( rw == 0 )
+ rw = new RWLock(0);
+ return *rw;
+ }*/
+ RWLock& forns(string ns) {
+ mapsf<string,RWLock*>::ref r(nslocks);
+#if defined(CLC)
+ massert(15964, str::stream() << "bad collection name to lock: " << ns, str::contains(ns, '.'));
+#endif
+ RWLock*& rw = r[ns];
+ if( rw == 0 ) {
+ rw = new RWLock(0);
+ }
+ return *rw;
+ }
+ } theLocks;
+
+#if defined(CLC)
+ LockCollectionForWriting::Locks::Locks(string ns) :
+ excluder(d.writeExcluder),
+ gslk(),
+ clk(theLocks.forns(ns),true)
+ { }
+ LockCollectionForWriting::~LockCollectionForWriting() {
+ if( locks.get() ) {
+ Client::LockStatus& s = cc().lockStatus;
+ s.whichCollection.clear();
+ }
+ }
+ LockCollectionForWriting::LockCollectionForWriting(string coll)
+ {
+ Client::LockStatus& s = cc().lockStatus;
+ LockBits b(s.state);
+ if( !s.whichCollection.empty() ) {
+ if( !subcollectionOf(s.whichCollection, coll.c_str()) ) {
+ massert(15937, str::stream() << "can't nest lock of " << coll << " beneath " << s.whichCollection, false);
+ }
+ if( b.get(LockBits::Collection) != LockBits::Exclusive ) {
+ massert(15938, str::stream() << "want collection write lock but it is already read locked " << s.state, false);
+ }
+ return;
+ }
+        verify(15965, !lkspecial(coll)); // you must take the global write lock for writes to specials
+ s.whichCollection = coll;
+ b.set(LockBits::Collection, LockBits::NotLocked, LockBits::Exclusive);
+ locks.reset( new Locks(coll) );
+ }
+#endif
+
+ LockCollectionForReading::LockCollectionForReading(string ns) :
+ gslk(),
+ clk( cc().lockStatus.collection, theLocks.forns(ns) )
+ {
+ Client::LockStatus& s = cc().lockStatus;
+ if( s.whichCollection.empty() ) {
+ s.whichCollection = ns;
+ }
+ else {
+ if( !subcollectionOf(s.whichCollection, ns.c_str()) ) {
+ if( lkspecial(ns) )
+ return;
+ massert(15939,
+ str::stream() << "can't nest lock of " << ns << " beneath " << s.whichCollection,
+ false);
+ }
+ }
+ }
+ LockCollectionForReading::~LockCollectionForReading() {
+ if( !clk.recursed() ) {
+ Client::LockStatus& s = cc().lockStatus;
+ s.whichCollection.clear();
+ }
+ }
+
+}
diff --git a/src/mongo/db/d_concurrency.h b/src/mongo/db/d_concurrency.h
new file mode 100644
index 00000000000..ba2f64f5126
--- /dev/null
+++ b/src/mongo/db/d_concurrency.h
@@ -0,0 +1,67 @@
+// @file d_concurrency.h
+
+#pragma once
+
+#include "../util/concurrency/rwlock.h"
+#include "db/mongomutex.h"
+
+namespace mongo {
+
+ namespace clcimpl {
+ enum LockStates { Unlocked, AcquireShared=1, LockedShared=2, AcquireExclusive=4, LockedExclusive=8 };
+ class Shared : boost::noncopyable {
+ unsigned& state;
+ RWLock *rw;
+ public:
+ Shared(unsigned& state, RWLock& lock);
+ ~Shared();
+ bool recursed() const { return rw == 0; }
+ };
+ class Exclusive : boost::noncopyable {
+ unsigned& state;
+ RWLock *rw;
+ public:
+ Exclusive(unsigned& state, RWLock& lock);
+ ~Exclusive();
+ };
+ }
+
+ typedef readlock GlobalSharedLock;
+
+ class ExcludeAllWrites : boost::noncopyable {
+ clcimpl::Exclusive lk;
+ GlobalSharedLock gslk;
+ public:
+ ExcludeAllWrites();
+ ~ExcludeAllWrites();
+ };
+
+ class todoGlobalWriteLock : boost::noncopyable {
+ public:
+ };
+
+ class LockCollectionForReading : boost::noncopyable {
+ GlobalSharedLock gslk;
+ clcimpl::Shared clk;
+ public:
+ LockCollectionForReading(string coll);
+ ~LockCollectionForReading();
+ };
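+
+    // Illustrative usage (sketch): a read path takes the global shared lock and the
+    // per-collection read lock with a single scoped object:
+    //     {
+    //         LockCollectionForReading lk( "test.foo" );
+    //         // ... read from test.foo ...
+    //     }   // both locks released here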
+
+#if defined(CLC)
+ class LockCollectionForWriting : boost::noncopyable {
+ struct Locks {
+ Locks(string ns);
+ SimpleRWLock::Shared excluder;
+ GlobalSharedLock gslk;
+ rwlock clk;
+ };
+ scoped_ptr<Locks> locks;
+ public:
+ LockCollectionForWriting(string db);
+ ~LockCollectionForWriting();
+ };
+#else
+#endif
+
+}
diff --git a/src/mongo/db/d_globals.cpp b/src/mongo/db/d_globals.cpp
new file mode 100644
index 00000000000..7e0fd9e8cb0
--- /dev/null
+++ b/src/mongo/db/d_globals.cpp
@@ -0,0 +1,20 @@
+// @file d_globals.cpp
+
+#include "pch.h"
+#include "d_globals.h"
+#include "../util/concurrency/rwlock.h"
+#include "clientcursor.h"
+#include "mongomutex.h"
+
+namespace mongo {
+
+ DGlobals::DGlobals() :
+ writeExcluder( *(new RWLock("writeexcluder")) ),
+ dbMutex( *(new MongoMutex("dbMutex")) ),
+ clientCursorMonitor( *(new ClientCursorMonitor()) )
+ {
+ }
+
+ DGlobals d;
+
+}
diff --git a/src/mongo/db/d_globals.h b/src/mongo/db/d_globals.h
new file mode 100644
index 00000000000..7c95d463cc3
--- /dev/null
+++ b/src/mongo/db/d_globals.h
@@ -0,0 +1,27 @@
+// @file d_globals.h
+//
+// these are global variables used in mongod ("d"). also used in test binary as that is effectively a variation on mongod code.
+// that is, these are not in mongos.
+//
+
+#pragma once
+
+namespace mongo {
+
+ class RWLock;
+ class MongoMutex;
+ class ClientCursorMonitor;
+
+ struct DGlobals : boost::noncopyable {
+ DGlobals();
+
+ // these are intentionally never deleted:
+ RWLock& writeExcluder;
+ MongoMutex &dbMutex;
+ ClientCursorMonitor& clientCursorMonitor;
+
+ };
+
+ extern DGlobals d;
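+
+    // illustrative usage (sketch): mongod code reaches these singletons through the
+    // global 'd', e.g. d.dbMutex.assertWriteLocked() before mutating shared state.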
+
+}
diff --git a/src/mongo/db/database.cpp b/src/mongo/db/database.cpp
new file mode 100644
index 00000000000..2d55fd35626
--- /dev/null
+++ b/src/mongo/db/database.cpp
@@ -0,0 +1,423 @@
+// database.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "database.h"
+#include "instance.h"
+#include "clientcursor.h"
+#include "databaseholder.h"
+
+namespace mongo {
+
+ bool Database::_openAllFiles = true;
+
+ void assertDbAtLeastReadLocked(const Database *) {
+ // temp impl
+ d.dbMutex.assertAtLeastReadLocked();
+ }
+
+ void assertDbWriteLocked(const Database *) {
+ // temp impl
+ d.dbMutex.assertWriteLocked();
+ }
+
+ Database::~Database() {
+ d.dbMutex.assertWriteLocked();
+ magic = 0;
+ size_t n = _files.size();
+ for ( size_t i = 0; i < n; i++ )
+ delete _files[i];
+ if( ccByLoc.size() ) {
+ log() << "\n\n\nWARNING: ccByLoc not empty on database close! " << ccByLoc.size() << ' ' << name << endl;
+ }
+ }
+
+ Database::Database(const char *nm, bool& newDb, const string& _path )
+ : name(nm), path(_path), namespaceIndex( path, name ),
+ profileName(name + ".system.profile")
+ {
+ try {
+ {
+ // check db name is valid
+ size_t L = strlen(nm);
+ uassert( 10028 , "db name is empty", L > 0 );
+ uassert( 10032 , "db name too long", L < 64 );
+ uassert( 10029 , "bad db name [1]", *nm != '.' );
+ uassert( 10030 , "bad db name [2]", nm[L-1] != '.' );
+ uassert( 10031 , "bad char(s) in db name", strchr(nm, ' ') == 0 );
+ }
+ newDb = namespaceIndex.exists();
+ profile = cmdLine.defaultProfile;
+ checkDuplicateUncasedNames(true);
+ // If already exists, open. Otherwise behave as if empty until
+ // there's a write, then open.
+ if ( ! newDb || cmdLine.defaultProfile ) {
+ namespaceIndex.init();
+ if( _openAllFiles )
+ openAllFiles();
+ }
+ magic = 781231;
+ } catch(std::exception& e) {
+            log() << "warning: database " << path << ' ' << nm << " could not be opened" << endl;
+ log() << e.what() << endl;
+ // since destructor won't be called:
+ for ( size_t i = 0; i < _files.size(); i++ )
+ delete _files[i];
+ throw;
+ }
+ }
+
+ void Database::checkDuplicateUncasedNames(bool inholderlock) const {
+ string duplicate = duplicateUncasedName(inholderlock, name, path );
+ if ( !duplicate.empty() ) {
+ stringstream ss;
+ ss << "db already exists with different case other: [" << duplicate << "] me [" << name << "]";
+ uasserted( DatabaseDifferCaseCode , ss.str() );
+ }
+ }
+
+ /*static*/
+ string Database::duplicateUncasedName( bool inholderlock, const string &name, const string &path, set< string > *duplicates ) {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ if ( duplicates ) {
+ duplicates->clear();
+ }
+
+ vector<string> others;
+ getDatabaseNames( others , path );
+
+ set<string> allShortNames;
+ dbHolder().getAllShortNames( inholderlock, allShortNames );
+
+ others.insert( others.end(), allShortNames.begin(), allShortNames.end() );
+
+ for ( unsigned i=0; i<others.size(); i++ ) {
+
+ if ( strcasecmp( others[i].c_str() , name.c_str() ) )
+ continue;
+
+ if ( strcmp( others[i].c_str() , name.c_str() ) == 0 )
+ continue;
+
+ if ( duplicates ) {
+ duplicates->insert( others[i] );
+ } else {
+ return others[i];
+ }
+ }
+ if ( duplicates ) {
+ return duplicates->empty() ? "" : *duplicates->begin();
+ }
+ return "";
+ }
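+
+    // example (illustrative): with a database "Foo" already on disk,
+    // duplicateUncasedName(false, "foo", dbpath) returns "Foo"; exact-case matches are
+    // skipped, so for an existing "foo" it returns "".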
+
+ boost::filesystem::path Database::fileName( int n ) const {
+ stringstream ss;
+ ss << name << '.' << n;
+ boost::filesystem::path fullName;
+ fullName = boost::filesystem::path(path);
+ if ( directoryperdb )
+ fullName /= name;
+ fullName /= ss.str();
+ return fullName;
+ }
+
+ bool Database::openExistingFile( int n ) {
+ assert(this);
+ d.dbMutex.assertWriteLocked();
+ {
+ // must not yet be visible to others as we aren't in the db's write lock and
+ // we will write to _files vector - thus this assert.
+ bool loaded = dbHolder().__isLoaded(name, path);
+ assert( !loaded );
+ }
+ // additionally must be in the dbholder mutex (no assert for that yet)
+
+        // todo: why here? that could be bad as we may hold only a read lock here
+ namespaceIndex.init();
+
+ if ( n < 0 || n >= DiskLoc::MaxFiles ) {
+ massert( 15924 , str::stream() << "getFile(): bad file number value " << n << " (corrupt db?): run repair", false);
+ }
+
+ {
+ if( n < (int) _files.size() && _files[n] ) {
+ dlog(2) << "openExistingFile " << n << " is already open" << endl;
+ return true;
+ }
+ }
+
+ {
+ boost::filesystem::path fullName = fileName( n );
+ string fullNameString = fullName.string();
+ MongoDataFile *df = new MongoDataFile(n);
+ try {
+ if( !df->openExisting( fullNameString.c_str() ) ) {
+ delete df;
+ return false;
+ }
+ }
+ catch ( AssertionException& ) {
+ delete df;
+ throw;
+ }
+ while ( n >= (int) _files.size() ) {
+ _files.push_back(0);
+ }
+ _files[n] = df;
+ }
+
+ return true;
+ }
+
+    // todo : we stop once a datafile doesn't exist.
+    //        if one datafile in the middle were missing we should keep going for
+    //        repair purposes, yet we do not.
+ void Database::openAllFiles() {
+ //log() << "TEMP openallfiles " << path << ' ' << name << endl;
+ assert(this);
+ int n = 0;
+ while( openExistingFile(n) ) {
+ n++;
+ }
+
+ /*
+ int n = 0;
+ while( exists(n) ) {
+ getFile(n);
+ n++;
+ }
+ // If last file is empty, consider it preallocated and make sure it's not mapped
+ // until a write is requested
+ if ( n > 1 && getFile( n - 1 )->getHeader()->isEmpty() ) {
+ delete _files[ n - 1 ];
+ _files.pop_back();
+ }*/
+ }
+
+ // todo: this is called a lot. streamline the common case
+ MongoDataFile* Database::getFile( int n, int sizeNeeded , bool preallocateOnly) {
+ assert(this);
+ DEV assertDbAtLeastReadLocked(this);
+
+ namespaceIndex.init();
+ if ( n < 0 || n >= DiskLoc::MaxFiles ) {
+ out() << "getFile(): n=" << n << endl;
+ massert( 10295 , "getFile(): bad file number value (corrupt db?): run repair", false);
+ }
+ DEV {
+ if ( n > 100 ) {
+ out() << "getFile(): n=" << n << endl;
+ }
+ }
+ MongoDataFile* p = 0;
+ if ( !preallocateOnly ) {
+ while ( n >= (int) _files.size() ) {
+ DEV if( !d.dbMutex.isWriteLocked() ) {
+ log() << "error: getFile() called in a read lock, yet file to return is not yet open" << endl;
+ log() << " getFile(" << n << ") _files.size:" <<_files.size() << ' ' << fileName(n).string() << endl;
+ log() << " context ns: " << cc().ns() << " openallfiles:" << _openAllFiles << endl;
+ }
+ assertDbWriteLocked(this);
+ _files.push_back(0);
+ }
+ p = _files[n];
+ }
+ if ( p == 0 ) {
+ assertDbWriteLocked(this);
+ boost::filesystem::path fullName = fileName( n );
+ string fullNameString = fullName.string();
+ p = new MongoDataFile(n);
+ int minSize = 0;
+ if ( n != 0 && _files[ n - 1 ] )
+ minSize = _files[ n - 1 ]->getHeader()->fileLength;
+ if ( sizeNeeded + DataFileHeader::HeaderSize > minSize )
+ minSize = sizeNeeded + DataFileHeader::HeaderSize;
+ try {
+ p->open( fullNameString.c_str(), minSize, preallocateOnly );
+ }
+ catch ( AssertionException& ) {
+ delete p;
+ throw;
+ }
+ if ( preallocateOnly )
+ delete p;
+ else
+ _files[n] = p;
+ }
+ return preallocateOnly ? 0 : p;
+ }
+
+ MongoDataFile* Database::addAFile( int sizeNeeded, bool preallocateNextFile ) {
+ assertDbWriteLocked(this);
+ int n = (int) _files.size();
+ MongoDataFile *ret = getFile( n, sizeNeeded );
+ if ( preallocateNextFile )
+ preallocateAFile();
+ return ret;
+ }
+
+ bool fileIndexExceedsQuota( const char *ns, int fileIndex, bool enforceQuota ) {
+ return
+ cmdLine.quota &&
+ enforceQuota &&
+ fileIndex >= cmdLine.quotaFiles &&
+ // we don't enforce the quota on "special" namespaces as that could lead to problems -- e.g.
+ // rejecting an index insert after inserting the main record.
+ !NamespaceString::special( ns ) &&
+ NamespaceString( ns ).db != "local";
+ }
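+
+    // example (illustrative): with --quota and --quotaFiles 8, allocating file #8 for
+    // "test.foo" exceeds the quota; special namespaces and the "local" db are exempt.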
+
+ MongoDataFile* Database::suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota ) {
+
+ // check existing files
+ for ( int i=numFiles()-1; i>=0; i-- ) {
+ MongoDataFile* f = getFile( i );
+ if ( f->getHeader()->unusedLength >= sizeNeeded ) {
+ if ( fileIndexExceedsQuota( ns, i-1, enforceQuota ) ) // NOTE i-1 is the value used historically for this check.
+ ;
+ else
+ return f;
+ }
+ }
+
+ if ( fileIndexExceedsQuota( ns, numFiles(), enforceQuota ) )
+ uasserted(12501, "quota exceeded");
+
+ // allocate files until we either get one big enough or hit maxSize
+ for ( int i = 0; i < 8; i++ ) {
+ MongoDataFile* f = addAFile( sizeNeeded, preallocate );
+
+ if ( f->getHeader()->unusedLength >= sizeNeeded )
+ return f;
+
+ if ( f->getHeader()->fileLength >= MongoDataFile::maxSize() ) // this is as big as they get so might as well stop
+ return f;
+ }
+
+ uasserted(14810, "couldn't allocate space (suitableFile)"); // callers don't check for null return code
+ return 0;
+ }
+
+ MongoDataFile* Database::newestFile() {
+ int n = numFiles();
+ if ( n == 0 )
+ return 0;
+ return getFile(n-1);
+ }
+
+
+ Extent* Database::allocExtent( const char *ns, int size, bool capped, bool enforceQuota ) {
+ // todo: when profiling, these may be worth logging into profile collection
+ bool fromFreeList = true;
+ Extent *e = DataFileMgr::allocFromFreeList( ns, size, capped );
+ if( e == 0 ) {
+ fromFreeList = false;
+ e = suitableFile( ns, size, !capped, enforceQuota )->createExtent( ns, size, capped );
+ }
+ LOG(1) << "allocExtent " << ns << " size " << size << ' ' << fromFreeList << endl;
+ return e;
+ }
+
+
+ bool Database::setProfilingLevel( int newLevel , string& errmsg ) {
+ if ( profile == newLevel )
+ return true;
+
+ if ( newLevel < 0 || newLevel > 2 ) {
+ errmsg = "profiling level has to be >=0 and <= 2";
+ return false;
+ }
+
+ if ( newLevel == 0 ) {
+ profile = 0;
+ return true;
+ }
+
+ assert( cc().database() == this );
+
+ if ( ! namespaceIndex.details( profileName.c_str() ) ) {
+ log() << "creating profile collection: " << profileName << endl;
+ BSONObjBuilder spec;
+ spec.appendBool( "capped", true );
+ spec.append( "size", 1024*1024 );
+            if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , false /* we don't replicate profile messages */ ) ) {
+ return false;
+ }
+ }
+ profile = newLevel;
+ return true;
+ }
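+
+    // illustrative use (sketch): the shell's db.setProfilingLevel(1) reaches this via
+    // the profile command, roughly:
+    //     string errmsg;
+    //     bool ok = cc().database()->setProfilingLevel( 1, errmsg );
+    // creating the capped <dbname>.system.profile collection on first use.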
+
+ bool Database::exists(int n) const {
+ return boost::filesystem::exists( fileName( n ) );
+ }
+
+ int Database::numFiles() const {
+ DEV assertDbAtLeastReadLocked(this);
+ return (int) _files.size();
+ }
+
+ void Database::flushFiles( bool sync ) {
+ assertDbAtLeastReadLocked(this);
+ for( vector<MongoDataFile*>::iterator i = _files.begin(); i != _files.end(); i++ ) {
+ MongoDataFile *f = *i;
+ f->flush(sync);
+ }
+ }
+
+ long long Database::fileSize() const {
+ long long size=0;
+ for (int n=0; exists(n); n++)
+ size += boost::filesystem::file_size( fileName(n) );
+ return size;
+ }
+
+ Database* DatabaseHolder::getOrCreate( const string& ns , const string& path , bool& justCreated ) {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ DBs& m = _paths[path];
+
+ string dbname = _todb( ns );
+
+ {
+ DBs::iterator i = m.find(dbname);
+ if( i != m.end() ) {
+ justCreated = false;
+ return i->second;
+ }
+ }
+
+        // todo: protect against getting sprayed with requests for different db names that don't exist -
+        //       that would make the DBs map very large.  not clear how to handle it though;
+        //       perhaps just log it, which is what we do here with the "> 40" :
+ bool cant = !d.dbMutex.isWriteLocked();
+ if( logLevel >= 1 || m.size() > 40 || cant || DEBUG_BUILD ) {
+ log() << "opening db: " << (path==dbpath?"":path) << ' ' << dbname << endl;
+ }
+ massert(15927, "can't open database in a read lock. if db was just closed, consider retrying the query. might otherwise indicate an internal error", !cant);
+
+ Database *db = new Database( dbname.c_str() , justCreated , path );
+ m[dbname] = db;
+ _size++;
+ return db;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/database.h b/src/mongo/db/database.h
new file mode 100644
index 00000000000..a7867e20e8c
--- /dev/null
+++ b/src/mongo/db/database.h
@@ -0,0 +1,145 @@
+// database.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "cmdline.h"
+#include "namespace.h"
+
+namespace mongo {
+
+ class Extent;
+ class MongoDataFile;
+ class ClientCursor;
+ struct ByLocKey;
+ typedef map<ByLocKey, ClientCursor*> CCByLoc;
+
+ /**
+     * Database represents a single database.
+     * Each database has its own set of files -- dbname.ns, dbname.0, dbname.1, ...
+ * NOT memory mapped
+ */
+ class Database {
+ public:
+ static bool _openAllFiles;
+
+ // you probably need to be in dbHolderMutex when constructing this
+ Database(const char *nm, /*out*/ bool& newDb, const string& _path = dbpath);
+ private:
+ ~Database(); // closes files and other cleanup see below.
+ public:
+ /* you must use this to close - there is essential code in this method that is not in the ~Database destructor.
+ thus the destructor is private. this could be cleaned up one day...
+ */
+ static void closeDatabase( const char *db, const string& path );
+
+ void openAllFiles();
+
+ /**
+ * tries to make sure that this hasn't been deleted
+ */
+ bool isOk() const { return magic == 781231; }
+
+ bool isEmpty() { return ! namespaceIndex.allocated(); }
+
+ /**
+ * total file size of Database in bytes
+ */
+ long long fileSize() const;
+
+ int numFiles() const;
+
+ /**
+ * returns file valid for file number n
+     * returns the filesystem path for file number n
+ boost::filesystem::path fileName( int n ) const;
+
+ private:
+ bool exists(int n) const;
+ bool openExistingFile( int n );
+
+ public:
+ /**
+ * return file n. if it doesn't exist, create it
+ */
+ MongoDataFile* getFile( int n, int sizeNeeded = 0, bool preallocateOnly = false );
+
+ MongoDataFile* addAFile( int sizeNeeded, bool preallocateNextFile );
+
+ /**
+ * makes sure we have an extra file at the end that is empty
+ * safe to call this multiple times - the implementation will only preallocate one file
+ */
+ void preallocateAFile() { getFile( numFiles() , 0, true ); }
+
+ MongoDataFile* suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota );
+
+ Extent* allocExtent( const char *ns, int size, bool capped, bool enforceQuota );
+
+ MongoDataFile* newestFile();
+
+ /**
+ * @return true if success. false if bad level or error creating profile ns
+ */
+ bool setProfilingLevel( int newLevel , string& errmsg );
+
+ void flushFiles( bool sync );
+
+ /**
+ * @return true if ns is part of the database
+ * ns=foo.bar, db=foo returns true
+ */
+ bool ownsNS( const string& ns ) const {
+ if ( ! startsWith( ns , name ) )
+ return false;
+ return ns[name.size()] == '.';
+ }
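+        // examples (illustrative): for db "foo", ownsNS("foo.bar") is true, while
+        // ownsNS("foobar.baz") and ownsNS("foo") are false.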
+ private:
+ /**
+ * @throws DatabaseDifferCaseCode if the name is a duplicate based on
+ * case insensitive matching.
+ */
+ void checkDuplicateUncasedNames(bool inholderlockalready) const;
+ public:
+ /**
+ * @return name of an existing database with same text name but different
+ * casing, if one exists. Otherwise the empty string is returned. If
+ * 'duplicates' is specified, it is filled with all duplicate names.
+ */
+ static string duplicateUncasedName( bool inholderlockalready, const string &name, const string &path, set< string > *duplicates = 0 );
+
+ const string name; // "alleyinsider"
+ const string path;
+
+ private:
+
+        // must be in the dbLock when touching this (and write locked when writing to it, of course);
+ // however during Database object construction we aren't, which is ok as it isn't yet visible
+ // to others and we are in the dbholder lock then.
+ vector<MongoDataFile*> _files;
+
+ public: // this should be private later
+
+ NamespaceIndex namespaceIndex;
+ int profile; // 0=off.
+ const string profileName; // "alleyinsider.system.profile"
+ CCByLoc ccByLoc;
+ int magic; // used for making sure the object is still loaded in memory
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/databaseholder.h b/src/mongo/db/databaseholder.h
new file mode 100644
index 00000000000..7c878c4ed63
--- /dev/null
+++ b/src/mongo/db/databaseholder.h
@@ -0,0 +1,126 @@
+// @file databaseholder.h
+
+#pragma once
+
+namespace mongo {
+
+ /**
+ * path + dbname -> Database
+ */
+ class DatabaseHolder {
+ typedef map<string,Database*> DBs;
+ typedef map<string,DBs> Paths;
+ public:
+ DatabaseHolder() : _size(0) { }
+
+ bool __isLoaded( const string& ns , const string& path ) const {
+ Paths::const_iterator x = _paths.find( path );
+ if ( x == _paths.end() )
+ return false;
+ const DBs& m = x->second;
+
+ string db = _todb( ns );
+
+ DBs::const_iterator it = m.find(db);
+ return it != m.end();
+ }
+ // must be write locked as otherwise isLoaded could go false->true on you
+ // in the background and you might not expect that.
+ bool _isLoaded( const string& ns , const string& path ) const {
+ d.dbMutex.assertWriteLocked();
+ return __isLoaded(ns,path);
+ }
+
+ Database * get( const string& ns , const string& path ) const {
+ d.dbMutex.assertAtLeastReadLocked();
+ Paths::const_iterator x = _paths.find( path );
+ if ( x == _paths.end() )
+ return 0;
+ const DBs& m = x->second;
+ string db = _todb( ns );
+ DBs::const_iterator it = m.find(db);
+ if ( it != m.end() )
+ return it->second;
+ return 0;
+ }
+
+ void _put( const string& ns , const string& path , Database * db ) {
+ d.dbMutex.assertAtLeastReadLocked();
+ DBs& m = _paths[path];
+ Database*& d = m[_todb(ns)];
+ if( d ) {
+ dlog(2) << "info dbholder put db was already set " << ns << endl;
+ }
+ else {
+ _size++;
+ }
+ d = db;
+ }
+
+ Database* getOrCreate( const string& ns , const string& path , bool& justCreated );
+
+ void erase( const string& ns , const string& path ) {
+            d.dbMutex.assertWriteLocked(); // write lock req'd as a Database obj can be in use; dbHolderMutex is mainly just to control the holder itself
+ DBs& m = _paths[path];
+ _size -= (int)m.erase( _todb( ns ) );
+ }
+
+ /** @param force - force close even if something underway - use at shutdown */
+ bool closeAll( const string& path , BSONObjBuilder& result, bool force );
+
+        // "info" as this is informational only; it could change on you if you are not write locked
+ int sizeInfo() const { return _size; }
+
+ void forEach(boost::function<void(Database *)> f) const {
+ d.dbMutex.assertWriteLocked();
+ for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) {
+ DBs m = i->second;
+ for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) {
+ f(j->second);
+ }
+ }
+ }
+
+ /**
+ * gets all unique db names, ignoring paths
+ */
+ void getAllShortNames( bool locked, set<string>& all ) const {
+ d.dbMutex.assertAtLeastReadLocked();
+ for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) {
+ DBs m = i->second;
+ for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) {
+ all.insert( j->first );
+ }
+ }
+ }
+
+ private:
+ static string _todb( const string& ns ) {
+ string d = __todb( ns );
+ uassert( 13280 , (string)"invalid db name: " + ns , NamespaceString::validDBName( d ) );
+ return d;
+ }
+ static string __todb( const string& ns ) {
+ size_t i = ns.find( '.' );
+ if ( i == string::npos ) {
+ uassert( 13074 , "db name can't be empty" , ns.size() );
+ return ns;
+ }
+ uassert( 13075 , "db name can't be empty" , i > 0 );
+ return ns.substr( 0 , i );
+ }
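+        // e.g. _todb("alleyinsider.system.profile") == "alleyinsider";
+        // a bare db name such as _todb("alleyinsider") is returned unchanged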
+ Paths _paths;
+ int _size;
+ };
+
+ DatabaseHolder& dbHolderUnchecked();
+ inline const DatabaseHolder& dbHolder() {
+ dassert( d.dbMutex.atLeastReadLocked() );
+ return dbHolderUnchecked();
+ }
+ inline DatabaseHolder& dbHolderW() {
+ dassert( d.dbMutex.isWriteLocked() );
+ return dbHolderUnchecked();
+ }
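+
+    /* minimal usage sketch (illustrative only; assumes the appropriate lock is
+       already held, per the dasserts above):
+
+           bool justCreated;
+           Database* db = dbHolderW().getOrCreate( "test.foo" , dbpath , justCreated );
+           // subsequent lookups under the lock return the same object:
+           dassert( dbHolder().get( "test.foo" , dbpath ) == db );
+    */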
+
+}
diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp
new file mode 100644
index 00000000000..af03b447976
--- /dev/null
+++ b/src/mongo/db/db.cpp
@@ -0,0 +1,1309 @@
+// @file db.cpp : Defines main() for the mongod program.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "introspect.h"
+#include "repl.h"
+#include "../util/unittest.h"
+#include "../util/file_allocator.h"
+#include "../util/background.h"
+#include "../util/text.h"
+#include "dbmessage.h"
+#include "instance.h"
+#include "clientcursor.h"
+#include "pdfile.h"
+#include "stats/counters.h"
+#include "repl/rs.h"
+#include "../scripting/engine.h"
+#include "module.h"
+#include "cmdline.h"
+#include "stats/snapshots.h"
+#include "../util/concurrency/task.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "../util/net/message_server.h"
+#include "client.h"
+#include "restapi.h"
+#include "dbwebserver.h"
+#include "dur.h"
+#include "concurrency.h"
+#include "../s/d_writeback.h"
+#include "d_globals.h"
+
+#if defined(_WIN32)
+# include "../util/ntservice.h"
+#else
+# include <sys/file.h>
+#endif
+
+namespace mongo {
+
+ namespace dur {
+ extern unsigned long long DataLimitPerJournalFile;
+ }
+
+    /* only off if --nocursors, which is for debugging. */
+ extern bool useCursors;
+
+ /* only off if --nohints */
+ extern bool useHints;
+
+ extern int diagLogging;
+ extern unsigned lenForNewNsFiles;
+ extern int lockFile;
+ extern bool checkNsFilesOnLoad;
+ extern string repairpath;
+
+ void setupSignals( bool inFork );
+ void startReplication();
+ void exitCleanly( ExitCode code );
+
+ CmdLine cmdLine;
+ static bool scriptingEnabled = true;
+ bool noHttpInterface = false;
+ bool shouldRepairDatabases = 0;
+ static bool forceRepair = 0;
+ Timer startupSrandTimer;
+
+ const char *ourgetns() {
+ Client *c = currentClient.get();
+ if ( ! c )
+ return "";
+ Client::Context* cc = c->getContext();
+ return cc ? cc->ns() : "";
+ }
+
+ struct MyStartupTests {
+ MyStartupTests() {
+ assert( sizeof(OID) == 12 );
+ }
+ } mystartupdbcpp;
+
+ QueryResult* emptyMoreResult(long long);
+
+
+    /* todo: make this a real test. the tests in dbtests/ all seem to use dbdirectclient, which exhaust doesn't support yet. */
+// QueryOption_Exhaust
+#define TESTEXHAUST 0
+#if( TESTEXHAUST )
+ void testExhaust() {
+ sleepsecs(1);
+ unsigned n = 0;
+ auto f = [&n](const BSONObj& o) {
+ assert( o.valid() );
+ //cout << o << endl;
+ n++;
+ bool testClosingSocketOnError = false;
+ if( testClosingSocketOnError )
+ assert(false);
+ };
+ DBClientConnection db(false);
+ db.connect("localhost");
+ const char *ns = "local.foo";
+ if( db.count(ns) < 10000 )
+ for( int i = 0; i < 20000; i++ )
+ db.insert(ns, BSON("aaa" << 3 << "b" << "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
+
+ try {
+ db.query(f, ns, Query() );
+ }
+ catch(...) {
+ cout << "hmmm" << endl;
+ }
+
+ try {
+ db.query(f, ns, Query() );
+ }
+ catch(...) {
+ cout << "caught" << endl;
+ }
+
+ cout << n << endl;
+    }
+#endif
+
+ void sysRuntimeInfo() {
+ out() << "sysinfo:" << endl;
+#if defined(_SC_PAGE_SIZE)
+ out() << " page size: " << (int) sysconf(_SC_PAGE_SIZE) << endl;
+#endif
+#if defined(_SC_PHYS_PAGES)
+ out() << " _SC_PHYS_PAGES: " << sysconf(_SC_PHYS_PAGES) << endl;
+#endif
+#if defined(_SC_AVPHYS_PAGES)
+ out() << " _SC_AVPHYS_PAGES: " << sysconf(_SC_AVPHYS_PAGES) << endl;
+#endif
+ }
+
+ /* if server is really busy, wait a bit */
+ void beNice() {
+ sleepmicros( Client::recommendedYieldMicros() );
+ }
+
+ class MyMessageHandler : public MessageHandler {
+ public:
+ virtual void connected( AbstractMessagingPort* p ) {
+ Client& c = Client::initThread("conn", p);
+ c.getAuthenticationInfo()->isLocalHost = p->remote().isLocalHost();
+ }
+
+ virtual void process( Message& m , AbstractMessagingPort* port , LastError * le) {
+ while ( true ) {
+ if ( inShutdown() ) {
+ log() << "got request after shutdown()" << endl;
+ break;
+ }
+
+ lastError.startRequest( m , le );
+
+ DbResponse dbresponse;
+ assembleResponse( m, dbresponse, port->remote() );
+
+ if ( dbresponse.response ) {
+ port->reply(m, *dbresponse.response, dbresponse.responseTo);
+ if( dbresponse.exhaust ) {
+ MsgData *header = dbresponse.response->header();
+ QueryResult *qr = (QueryResult *) header;
+ long long cursorid = qr->cursorId;
+ if( cursorid ) {
+ assert( dbresponse.exhaust && *dbresponse.exhaust != 0 );
+                        string ns = dbresponse.exhaust; // before reset() frees it...
+ m.reset();
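+                        // hand-build an OP_GET_MORE request in place: message header
+                        // (size, id, responseTo, opcode) followed by the getMore body
+                        // (reserved int32, ns, ntoreturn=0, cursorid), so this loop can
+                        // stream the next batch back over the same socket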
+ BufBuilder b(512);
+ b.appendNum((int) 0 /*size set later in appendData()*/);
+ b.appendNum(header->id);
+ b.appendNum(header->responseTo);
+ b.appendNum((int) dbGetMore);
+ b.appendNum((int) 0);
+ b.appendStr(ns);
+ b.appendNum((int) 0); // ntoreturn
+ b.appendNum(cursorid);
+ m.appendData(b.buf(), b.len());
+ b.decouple();
+ DEV log() << "exhaust=true sending more" << endl;
+ beNice();
+ continue; // this goes back to top loop
+ }
+ }
+ }
+ break;
+ }
+ }
+
+ virtual void disconnected( AbstractMessagingPort* p ) {
+ Client * c = currentClient.get();
+ if( c ) c->shutdown();
+ globalScriptEngine->threadDone();
+ }
+
+ };
+
+ void listen(int port) {
+ //testTheDb();
+ MessageServer::Options options;
+ options.port = port;
+ options.ipList = cmdLine.bind_ip;
+
+ MessageServer * server = createServer( options , new MyMessageHandler() );
+ server->setAsTimeTracker();
+
+ startReplication();
+ if ( !noHttpInterface )
+ boost::thread web( boost::bind(&webServerThread, new RestAdminAccess() /* takes ownership */));
+
+#if(TESTEXHAUST)
+ boost::thread thr(testExhaust);
+#endif
+ server->run();
+ }
+
+
+ bool doDBUpgrade( const string& dbName , string errmsg , DataFileHeader * h ) {
+ static DBDirectClient db;
+
+ if ( h->version == 4 && h->versionMinor == 4 ) {
+ assert( PDFILE_VERSION == 4 );
+ assert( PDFILE_VERSION_MINOR == 5 );
+
+ list<string> colls = db.getCollectionNames( dbName );
+ for ( list<string>::iterator i=colls.begin(); i!=colls.end(); i++) {
+ string c = *i;
+ log() << "\t upgrading collection:" << c << endl;
+ BSONObj out;
+ bool ok = db.runCommand( dbName , BSON( "reIndex" << c.substr( dbName.size() + 1 ) ) , out );
+ if ( ! ok ) {
+ errmsg = "reindex failed";
+ log() << "\t\t reindex failed: " << out << endl;
+ return false;
+ }
+ }
+
+ h->versionMinor = 5;
+ return true;
+ }
+
+ // do this in the general case
+ return repairDatabase( dbName.c_str(), errmsg );
+ }
+
+    // runs at startup.
+ static void repairDatabasesAndCheckVersion() {
+ // LastError * le = lastError.get( true );
+ Client::GodScope gs;
+ log(1) << "enter repairDatabases (to check pdfile version #)" << endl;
+
+ //assert(checkNsFilesOnLoad);
+ checkNsFilesOnLoad = false; // we are mainly just checking the header - don't scan the whole .ns file for every db here.
+
+ dblock lk;
+ vector< string > dbNames;
+ getDatabaseNames( dbNames );
+ for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
+ string dbName = *i;
+ log(1) << "\t" << dbName << endl;
+ Client::Context ctx( dbName );
+ MongoDataFile *p = cc().database()->getFile( 0 );
+ DataFileHeader *h = p->getHeader();
+ if ( !h->isCurrentVersion() || forceRepair ) {
+
+ if( h->version <= 0 ) {
+ uasserted(14026,
+ str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version
+ << " info: " << h->versionMinor << ' ' << h->fileLength);
+ }
+
+ log() << "****" << endl;
+ log() << "****" << endl;
+ log() << "need to upgrade database " << dbName << " with pdfile version " << h->version << "." << h->versionMinor << ", "
+ << "new version: " << PDFILE_VERSION << "." << PDFILE_VERSION_MINOR << endl;
+ if ( shouldRepairDatabases ) {
+ // QUESTION: Repair even if file format is higher version than code?
+ log() << "\t starting upgrade" << endl;
+ string errmsg;
+ assert( doDBUpgrade( dbName , errmsg , h ) );
+ }
+ else {
+ log() << "\t Not upgrading, exiting" << endl;
+ log() << "\t run --upgrade to upgrade dbs, then start again" << endl;
+ log() << "****" << endl;
+ dbexit( EXIT_NEED_UPGRADE );
+ shouldRepairDatabases = 1;
+ return;
+ }
+ }
+ else {
+ Database::closeDatabase( dbName.c_str(), dbpath );
+ }
+ }
+
+ log(1) << "done repairDatabases" << endl;
+
+ if ( shouldRepairDatabases ) {
+ log() << "finished checking dbs" << endl;
+ cc().shutdown();
+ dbexit( EXIT_CLEAN );
+ }
+
+ checkNsFilesOnLoad = true;
+ }
+
+ void clearTmpFiles() {
+ boost::filesystem::path path( dbpath );
+ for ( boost::filesystem::directory_iterator i( path );
+ i != boost::filesystem::directory_iterator(); ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if ( boost::filesystem::is_directory( *i ) &&
+ fileName.length() && fileName[ 0 ] == '$' )
+ boost::filesystem::remove_all( *i );
+ }
+ }
+
+ void checkIfReplMissingFromCommandLine() {
+ if( !cmdLine.usingReplSets() ) {
+ Client::GodScope gs;
+ DBDirectClient c;
+ unsigned long long x =
+ c.count("local.system.replset");
+ if( x ) {
+ log() << endl;
+ log() << "** warning: mongod started without --replSet yet " << x << " documents are present in local.system.replset" << endl;
+ log() << "** restart with --replSet unless you are doing maintenance and no other clients are connected" << endl;
+ log() << endl;
+ }
+ }
+ }
+
+ void clearTmpCollections() {
+        writelock lk; // _openAllFiles is false at this point, so this helps the query below work; you can't open files when read locked
+ Client::GodScope gs;
+ vector< string > toDelete;
+ DBDirectClient cli;
+ auto_ptr< DBClientCursor > c = cli.query( "local.system.namespaces", Query( fromjson( "{name:/^local.temp./}" ) ) );
+ while( c->more() ) {
+ BSONObj o = c->next();
+ toDelete.push_back( o.getStringField( "name" ) );
+ }
+ for( vector< string >::iterator i = toDelete.begin(); i != toDelete.end(); ++i ) {
+ log() << "Dropping old temporary collection: " << *i << endl;
+ cli.dropCollection( *i );
+ }
+ }
+
+ /**
+ * does background async flushes of mmapped files
+ */
+ class DataFileSync : public BackgroundJob {
+ public:
+ string name() const { return "DataFileSync"; }
+ void run() {
+ if( cmdLine.syncdelay == 0 )
+ log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl;
+ else if( cmdLine.syncdelay == 1 )
+ log() << "--syncdelay 1" << endl;
+ else if( cmdLine.syncdelay != 60 )
+ log(1) << "--syncdelay " << cmdLine.syncdelay << endl;
+ int time_flushing = 0;
+ while ( ! inShutdown() ) {
+ _diaglog.flush();
+ if ( cmdLine.syncdelay == 0 ) {
+ // in case at some point we add an option to change at runtime
+ sleepsecs(5);
+ continue;
+ }
+
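+                // sleep for whatever is left of the syncdelay interval, crediting
+                // the time the previous flush pass already took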
+ sleepmillis( (long long) std::max(0.0, (cmdLine.syncdelay * 1000) - time_flushing) );
+
+ if ( inShutdown() ) {
+ // occasional issue trying to flush during shutdown when sleep interrupted
+ break;
+ }
+
+ Date_t start = jsTime();
+ int numFiles = MemoryMappedFile::flushAll( true );
+ time_flushing = (int) (jsTime() - start);
+
+ globalFlushCounters.flushed(time_flushing);
+
+ if( logLevel >= 1 || time_flushing >= 10000 ) {
+ log() << "flushing mmaps took " << time_flushing << "ms " << " for " << numFiles << " files" << endl;
+ }
+ }
+ }
+
+ } dataFileSync;
+
+ const char * jsInterruptCallback() {
+ // should be safe to interrupt in js code, even if we have a write lock
+ return killCurrentOp.checkForInterruptNoAssert();
+ }
+
+ unsigned jsGetInterruptSpecCallback() {
+ return cc().curop()->opNum();
+ }
+
+ void _initAndListen(int listenPort ) {
+
+ Client::initThread("initandlisten");
+
+ Database::_openAllFiles = false;
+
+ Logstream::get().addGlobalTee( new RamLog("global") );
+
+ bool is32bit = sizeof(int*) == 4;
+
+ {
+#if !defined(_WIN32)
+ pid_t pid = getpid();
+#else
+ DWORD pid=GetCurrentProcessId();
+#endif
+ Nullstream& l = log();
+ l << "MongoDB starting : pid=" << pid << " port=" << cmdLine.port << " dbpath=" << dbpath;
+ if( replSettings.master ) l << " master=" << replSettings.master;
+ if( replSettings.slave ) l << " slave=" << (int) replSettings.slave;
+ l << ( is32bit ? " 32" : " 64" ) << "-bit host=" << getHostNameCached() << endl;
+ }
+ DEV log() << "_DEBUG build (which is slower)" << endl;
+ show_warnings();
+ log() << mongodVersion() << endl;
+ printGitVersion();
+ printSysInfo();
+ printCommandLineOpts();
+
+ {
+ stringstream ss;
+ ss << endl;
+ ss << "*********************************************************************" << endl;
+ ss << " ERROR: dbpath (" << dbpath << ") does not exist." << endl;
+ ss << " Create this directory or give existing directory in --dbpath." << endl;
+ ss << " See http://www.mongodb.org/display/DOCS/Starting+and+Stopping+Mongo" << endl;
+ ss << "*********************************************************************" << endl;
+ uassert( 10296 , ss.str().c_str(), boost::filesystem::exists( dbpath ) );
+ }
+ {
+ stringstream ss;
+ ss << "repairpath (" << repairpath << ") does not exist";
+ uassert( 12590 , ss.str().c_str(), boost::filesystem::exists( repairpath ) );
+ }
+
+ acquirePathLock(forceRepair);
+ remove_all( dbpath + "/_tmp/" );
+
+ FileAllocator::get()->start();
+
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( clearTmpFiles(), "clear tmp files" );
+
+ dur::startup();
+
+ if( cmdLine.durOptions & CmdLine::DurRecoverOnly )
+ return;
+
+ // comes after getDur().startup() because this reads from the database
+ clearTmpCollections();
+
+ checkIfReplMissingFromCommandLine();
+
+ Module::initAll();
+
+ if ( scriptingEnabled ) {
+ ScriptEngine::setup();
+ globalScriptEngine->setCheckInterruptCallback( jsInterruptCallback );
+ globalScriptEngine->setGetInterruptSpecCallback( jsGetInterruptSpecCallback );
+ }
+
+ repairDatabasesAndCheckVersion();
+
+ /* we didn't want to pre-open all files for the repair check above. for regular
+ operation we do for read/write lock concurrency reasons.
+ */
+ Database::_openAllFiles = true;
+
+ if ( shouldRepairDatabases )
+ return;
+
+ /* this is for security on certain platforms (nonce generation) */
+ srand((unsigned) (curTimeMicros() ^ startupSrandTimer.micros()));
+
+ snapshotThread.go();
+ d.clientCursorMonitor.go();
+ PeriodicTask::theRunner->go();
+
+#ifndef _WIN32
+ CmdLine::launchOk();
+#endif
+ listen(listenPort);
+
+        // listen() will return when the exit code path closes its socket.
+ exitCleanly(EXIT_NET_ERROR);
+ }
+
+ void testPretouch();
+
+ void initAndListen(int listenPort) {
+ try {
+ _initAndListen(listenPort);
+ }
+ catch ( DBException &e ) {
+ log() << "exception in initAndListen: " << e.toString() << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch ( std::exception &e ) {
+ log() << "exception in initAndListen std::exception: " << e.what() << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch ( int& n ) {
+ log() << "exception in initAndListen int: " << n << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch(...) {
+ log() << "exception in initAndListen, terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ }
+
+#if defined(_WIN32)
+ bool initService() {
+ ServiceController::reportStatus( SERVICE_RUNNING );
+ initAndListen( cmdLine.port );
+ return true;
+ }
+#endif
+
+} // namespace mongo
+
+using namespace mongo;
+
+#include <boost/program_options.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace po = boost::program_options;
+
+void show_help_text(po::options_description options) {
+ show_warnings();
+ cout << options << endl;
+}
+
+/* Return error string or "" if no errors. */
+string arg_error_check(int argc, char* argv[]) {
+ return "";
+}
+
+int main(int argc, char* argv[]) {
+ static StaticObserver staticObserver;
+ doPreServerStartupInits();
+ getcurns = ourgetns;
+
+ po::options_description general_options("General options");
+#if defined(_WIN32)
+ po::options_description windows_scm_options("Windows Service Control Manager options");
+#endif
+ po::options_description replication_options("Replication options");
+ po::options_description ms_options("Master/slave options");
+ po::options_description rs_options("Replica set options");
+ po::options_description sharding_options("Sharding options");
+ po::options_description visible_options("Allowed options");
+ po::options_description hidden_options("Hidden options");
+
+ po::positional_options_description positional_options;
+
+ CmdLine::addGlobalOptions( general_options , hidden_options );
+
+ general_options.add_options()
+ ("auth", "run with security")
+ ("cpu", "periodically show cpu and iowait utilization")
+ ("dbpath", po::value<string>() , "directory for datafiles")
+ ("diaglog", po::value<int>(), "0=off 1=W 2=R 3=both 7=W+some reads")
+ ("directoryperdb", "each database will be stored in a separate directory")
+ ("journal", "enable journaling")
+ ("journalOptions", po::value<int>(), "journal diagnostic options")
+ ("journalCommitInterval", po::value<unsigned>(), "how often to group/batch commit (ms)")
+ ("ipv6", "enable IPv6 support (disabled by default)")
+ ("jsonp","allow JSONP access via http (has security implications)")
+ ("noauth", "run without security")
+ ("nohttpinterface", "disable http interface")
+ ("nojournal", "disable journaling (journaling is on by default for 64 bit)")
+ ("noprealloc", "disable data file preallocation - will often hurt performance")
+ ("noscripting", "disable scripting engine")
+ ("notablescan", "do not allow table scans")
+ ("nssize", po::value<int>()->default_value(16), ".ns file size (in MB) for new databases")
+ ("profile",po::value<int>(), "0=off 1=slow, 2=all")
+ ("quota", "limits each database to a certain number of files (8 default)")
+ ("quotaFiles", po::value<int>(), "number of files allower per db, requires --quota")
+ ("rest","turn on simple rest api")
+ ("repair", "run repair on all dbs")
+ ("repairpath", po::value<string>() , "root directory for repair files - defaults to dbpath" )
+ ("slowms",po::value<int>(&cmdLine.slowMS)->default_value(100), "value of slow for profile and console log" )
+ ("smallfiles", "use a smaller default file size")
+#if defined(__linux__)
+ ("shutdown", "kill a running server (for init scripts)")
+#endif
+ ("syncdelay",po::value<double>(&cmdLine.syncdelay)->default_value(60), "seconds between disk syncs (0=never, but not recommended)")
+ ("sysinfo", "print some diagnostic system information")
+ ("upgrade", "upgrade db if needed")
+ ;
+
+#if defined(_WIN32)
+ CmdLine::addWindowsOptions( windows_scm_options, hidden_options );
+#endif
+
+ replication_options.add_options()
+ ("oplogSize", po::value<int>(), "size limit (in MB) for op log")
+ ;
+
+ ms_options.add_options()
+ ("master", "master mode")
+ ("slave", "slave mode")
+ ("source", po::value<string>(), "when slave: specify master as <server:port>")
+ ("only", po::value<string>(), "when slave: specify a single database to replicate")
+ ("slavedelay", po::value<int>(), "specify delay (in seconds) to be used when applying master ops to slave")
+ ("autoresync", "automatically resync if slave data is stale")
+ ;
+
+ rs_options.add_options()
+ ("replSet", po::value<string>(), "arg is <setname>[/<optionalseedhostlist>]")
+ ;
+
+ sharding_options.add_options()
+ ("configsvr", "declare this is a config db of a cluster; default port 27019; default dir /data/configdb")
+ ("shardsvr", "declare this is a shard db of a cluster; default port 27018")
+ ("noMoveParanoia" , "turn off paranoid saving of data for moveChunk. this is on by default for now, but default will switch" )
+ ;
+
+ hidden_options.add_options()
+ ("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer")
+ ("pretouch", po::value<int>(), "n pretouch threads for applying replicationed operations") // experimental
+ ("command", po::value< vector<string> >(), "command")
+ ("cacheSize", po::value<long>(), "cache size (in MB) for rec store")
+ ("nodur", "disable journaling")
+ // things we don't want people to use
+ ("nocursors", "diagnostic/debugging option that turns off cursors DO NOT USE IN PRODUCTION")
+ ("nohints", "ignore query hints")
+ ("nopreallocj", "don't preallocate journal files")
+ ("dur", "enable journaling") // old name for --journal
+ ("durOptions", po::value<int>(), "durability diagnostic options") // deprecated name
+ // deprecated pairing command line options
+ ("pairwith", "DEPRECATED")
+ ("arbiter", "DEPRECATED")
+ ("opIdMem", "DEPRECATED")
+ ;
+
+
+ positional_options.add("command", 3);
+ visible_options.add(general_options);
+#if defined(_WIN32)
+ visible_options.add(windows_scm_options);
+#endif
+ visible_options.add(replication_options);
+ visible_options.add(ms_options);
+ visible_options.add(rs_options);
+ visible_options.add(sharding_options);
+ Module::addOptions( visible_options );
+
+ setupCoreSignals();
+ setupSignals( false );
+
+ dbExecCommand = argv[0];
+
+ srand(curTimeMicros());
+#if( BOOST_VERSION >= 104500 )
+ boost::filesystem::path::default_name_check( boost::filesystem2::no_check );
+#else
+ boost::filesystem::path::default_name_check( boost::filesystem::no_check );
+#endif
+
+ {
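+        // endianness probe: on a little-endian cpu the lowest-addressed byte of
+        // 0x12345678 is 0x78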
+ unsigned x = 0x12345678;
+ unsigned char& b = (unsigned char&) x;
+ if ( b != 0x78 ) {
+ out() << "big endian cpus not yet supported" << endl;
+ return 33;
+ }
+ }
+
+ if( argc == 1 )
+ cout << dbExecCommand << " --help for help and startup options" << endl;
+
+ {
+ po::variables_map params;
+
+ string error_message = arg_error_check(argc, argv);
+ if (error_message != "") {
+ cout << error_message << endl << endl;
+ show_help_text(visible_options);
+ return 0;
+ }
+
+ if ( ! CmdLine::store( argc , argv , visible_options , hidden_options , positional_options , params ) )
+ return 0;
+
+ if (params.count("help")) {
+ show_help_text(visible_options);
+ return 0;
+ }
+ if (params.count("version")) {
+ cout << mongodVersion() << endl;
+ printGitVersion();
+ return 0;
+ }
+ if ( params.count( "dbpath" ) ) {
+ dbpath = params["dbpath"].as<string>();
+ if ( params.count( "fork" ) && dbpath[0] != '/' ) {
+                // we need to change dbpath if we fork, since we change
+                // cwd to "/"; fork only exists on *nix, so '/' is safe
+ dbpath = cmdLine.cwd + "/" + dbpath;
+ }
+ }
+ else {
+ dbpath = "/data/db/";
+ }
+#ifdef _WIN32
+ if (dbpath.size() > 1 && dbpath[dbpath.size()-1] == '/') {
+ // size() check is for the unlikely possibility of --dbpath "/"
+ dbpath = dbpath.erase(dbpath.size()-1);
+ }
+#endif
+
+ if ( params.count("directoryperdb")) {
+ directoryperdb = true;
+ }
+ if (params.count("cpu")) {
+ cmdLine.cpu = true;
+ }
+ if (params.count("noauth")) {
+ noauth = true;
+ }
+ if (params.count("auth")) {
+ noauth = false;
+ }
+ if (params.count("quota")) {
+ cmdLine.quota = true;
+ }
+ if (params.count("quotaFiles")) {
+ cmdLine.quota = true;
+ cmdLine.quotaFiles = params["quotaFiles"].as<int>() - 1;
+ }
+ bool journalExplicit = false;
+ if( params.count("nodur") || params.count( "nojournal" ) ) {
+ journalExplicit = true;
+ cmdLine.dur = false;
+ }
+ if( params.count("dur") || params.count( "journal" ) ) {
+ if (journalExplicit) {
+ log() << "Can't specify both --journal and --nojournal options." << endl;
+ return EXIT_BADOPTIONS;
+ }
+ journalExplicit = true;
+ cmdLine.dur = true;
+ }
+ if (params.count("durOptions")) {
+ cmdLine.durOptions = params["durOptions"].as<int>();
+ }
+ if( params.count("journalCommitInterval") ) {
+                // don't check if dur is false here as many will just use the default, and it will default to off on win32;
+                // i.e. no point making life a little more complex by giving an error in a dev environment.
+ cmdLine.journalCommitInterval = params["journalCommitInterval"].as<unsigned>();
+ if( cmdLine.journalCommitInterval <= 1 || cmdLine.journalCommitInterval > 300 ) {
+ out() << "--journalCommitInterval out of allowed range (0-300ms)" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if (params.count("journalOptions")) {
+ cmdLine.durOptions = params["journalOptions"].as<int>();
+ }
+ if (params.count("repairpath")) {
+ repairpath = params["repairpath"].as<string>();
+ if (!repairpath.size()) {
+ out() << "repairpath is empty" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if (params.count("nocursors")) {
+ useCursors = false;
+ }
+ if (params.count("nohints")) {
+ useHints = false;
+ }
+ if (params.count("nopreallocj")) {
+ cmdLine.preallocj = false;
+ }
+ if (params.count("nohttpinterface")) {
+ noHttpInterface = true;
+ }
+ if (params.count("rest")) {
+ cmdLine.rest = true;
+ }
+ if (params.count("jsonp")) {
+ cmdLine.jsonp = true;
+ }
+ if (params.count("noscripting")) {
+ scriptingEnabled = false;
+ }
+ if (params.count("noprealloc")) {
+ cmdLine.prealloc = false;
+ cout << "note: noprealloc may hurt performance in many applications" << endl;
+ }
+ if (params.count("smallfiles")) {
+ cmdLine.smallfiles = true;
+ assert( dur::DataLimitPerJournalFile >= 128 * 1024 * 1024 );
+ dur::DataLimitPerJournalFile = 128 * 1024 * 1024;
+ }
+ if (params.count("diaglog")) {
+ int x = params["diaglog"].as<int>();
+ if ( x < 0 || x > 7 ) {
+ out() << "can't interpret --diaglog setting" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ _diaglog.setLevel(x);
+ }
+ if (params.count("sysinfo")) {
+ sysRuntimeInfo();
+ return 0;
+ }
+ if (params.count("repair")) {
+ Record::MemoryTrackingEnabled = false;
+ shouldRepairDatabases = 1;
+ forceRepair = 1;
+ }
+ if (params.count("upgrade")) {
+ Record::MemoryTrackingEnabled = false;
+ shouldRepairDatabases = 1;
+ }
+ if (params.count("notablescan")) {
+ cmdLine.noTableScan = true;
+ }
+ if (params.count("master")) {
+ replSettings.master = true;
+ }
+ if (params.count("slave")) {
+ replSettings.slave = SimpleSlave;
+ }
+ if (params.count("slavedelay")) {
+ replSettings.slavedelay = params["slavedelay"].as<int>();
+ }
+ if (params.count("fastsync")) {
+ replSettings.fastsync = true;
+ }
+ if (params.count("autoresync")) {
+ replSettings.autoresync = true;
+ if( params.count("replSet") ) {
+ out() << "--autoresync is not used with --replSet" << endl;
+ out() << "see http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if (params.count("source")) {
+ /* specifies what the source in local.sources should be */
+ cmdLine.source = params["source"].as<string>().c_str();
+ }
+ if( params.count("pretouch") ) {
+ cmdLine.pretouch = params["pretouch"].as<int>();
+ }
+ if (params.count("replSet")) {
+ if (params.count("slavedelay")) {
+ out() << "--slavedelay cannot be used with --replSet" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ else if (params.count("only")) {
+ out() << "--only cannot be used with --replSet" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ /* seed list of hosts for the repl set */
+ cmdLine._replSet = params["replSet"].as<string>().c_str();
+ }
+ if (params.count("only")) {
+ cmdLine.only = params["only"].as<string>().c_str();
+ }
+ if( params.count("nssize") ) {
+ int x = params["nssize"].as<int>();
+ if (x <= 0 || x > (0x7fffffff/1024/1024)) {
+ out() << "bad --nssize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ lenForNewNsFiles = x * 1024 * 1024;
+ assert(lenForNewNsFiles > 0);
+ }
+ if (params.count("oplogSize")) {
+ long long x = params["oplogSize"].as<int>();
+ if (x <= 0) {
+ out() << "bad --oplogSize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ // note a small size such as x==1 is ok for an arbiter.
+ if( x > 1000 && sizeof(void*) == 4 ) {
+ out() << "--oplogSize of " << x << "MB is too big for 32 bit version. Use 64 bit build instead." << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ cmdLine.oplogSize = x * 1024 * 1024;
+ assert(cmdLine.oplogSize > 0);
+ }
+ if (params.count("cacheSize")) {
+ long x = params["cacheSize"].as<long>();
+ if (x <= 0) {
+ out() << "bad --cacheSize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ log() << "--cacheSize option not currently supported" << endl;
+ }
+ if (params.count("port") == 0 ) {
+ if( params.count("configsvr") ) {
+ cmdLine.port = CmdLine::ConfigServerPort;
+ }
+ if( params.count("shardsvr") ) {
+ if( params.count("configsvr") ) {
+ log() << "can't do --shardsvr and --configsvr at the same time" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ cmdLine.port = CmdLine::ShardServerPort;
+ }
+ }
+ else {
+ if ( cmdLine.port <= 0 || cmdLine.port > 65535 ) {
+ out() << "bad --port number" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if ( params.count("configsvr" ) ) {
+ cmdLine.configsvr = true;
+ if (cmdLine.usingReplSets() || replSettings.master || replSettings.slave) {
+ log() << "replication should not be enabled on a config server" << endl;
+ ::exit(-1);
+ }
+ if ( params.count( "nodur" ) == 0 && params.count( "nojournal" ) == 0 )
+ cmdLine.dur = true;
+ if ( params.count( "dbpath" ) == 0 )
+ dbpath = "/data/configdb";
+ }
+ if ( params.count( "profile" ) ) {
+ cmdLine.defaultProfile = params["profile"].as<int>();
+ }
+ if (params.count("ipv6")) {
+ enableIPv6();
+ }
+ if (params.count("noMoveParanoia")) {
+ cmdLine.moveParanoia = false;
+ }
+ if (params.count("pairwith") || params.count("arbiter") || params.count("opIdMem")) {
+ out() << "****" << endl;
+ out() << "Replica Pairs have been deprecated. Invalid options: --pairwith, --arbiter, and/or --opIdMem" << endl;
+ out() << "<http://www.mongodb.org/display/DOCS/Replica+Pairs>" << endl;
+ out() << "****" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+
+ // needs to be after things like --configsvr parsing, thus here.
+ if( repairpath.empty() )
+ repairpath = dbpath;
+
+ Module::configAll( params );
+ dataFileSync.go();
+
+ if (params.count("command")) {
+ vector<string> command = params["command"].as< vector<string> >();
+
+ if (command[0].compare("run") == 0) {
+ if (command.size() > 1) {
+ cout << "Too many parameters to 'run' command" << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+
+ initAndListen(cmdLine.port);
+ return 0;
+ }
+
+ if (command[0].compare("dbpath") == 0) {
+ cout << dbpath << endl;
+ return 0;
+ }
+
+ cout << "Invalid command: " << command[0] << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+
+ if( cmdLine.pretouch )
+ log() << "--pretouch " << cmdLine.pretouch << endl;
+
+#ifdef __linux__
+ if (params.count("shutdown")){
+ bool failed = false;
+
+ string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
+ if ( !boost::filesystem::exists( name ) || boost::filesystem::file_size( name ) == 0 )
+ failed = true;
+
+ pid_t pid;
+ string procPath;
+ if (!failed){
+ try {
+ ifstream f (name.c_str());
+ f >> pid;
+ procPath = (str::stream() << "/proc/" << pid);
+ if (!boost::filesystem::exists(procPath))
+ failed = true;
+
+ string exePath = procPath + "/exe";
+ if (boost::filesystem::exists(exePath)){
+ char buf[256];
+                    int ret = readlink(exePath.c_str(), buf, sizeof(buf)-1);
+                    if (ret == -1) {
+                        int e = errno;
+                        cerr << "Error resolving " << exePath << ": " << errnoWithDescription(e);
+                        failed = true;
+                    }
+                    else {
+                        buf[ret] = '\0'; // readlink doesn't terminate the string
+                        if (!endsWith(buf, "mongod")){
+                            cerr << "Process " << pid << " is running " << buf << " not mongod" << endl;
+                            ::exit(-1);
+                        }
+                    }
+ }
+ }
+ catch (const std::exception& e){
+ cerr << "Error reading pid from lock file [" << name << "]: " << e.what() << endl;
+ failed = true;
+ }
+ }
+
+ if (failed) {
+ cerr << "There doesn't seem to be a server running with dbpath: " << dbpath << endl;
+ ::exit(-1);
+ }
+
+ cout << "killing process with pid: " << pid << endl;
+ int ret = kill(pid, SIGTERM);
+ if (ret) {
+ int e = errno;
+ cerr << "failed to kill process: " << errnoWithDescription(e) << endl;
+ ::exit(-1);
+ }
+
+ while (boost::filesystem::exists(procPath)) {
+ sleepsecs(1);
+ }
+
+ ::exit(0);
+ }
+#endif
+
+#if defined(_WIN32)
+ if (serviceParamsCheck( params, dbpath, argc, argv )) {
+ return 0;
+ }
+#endif
+
+
+ if (sizeof(void*) == 4 && !journalExplicit){
+ // trying to make this stand out more like startup warnings
+ log() << endl;
+ warning() << "32-bit servers don't have journaling enabled by default. Please use --journal if you want durability." << endl;
+ log() << endl;
+ }
+
+ }
+
+ UnitTest::runTests();
+ initAndListen(cmdLine.port);
+ dbexit(EXIT_CLEAN);
+ return 0;
+}
+
+namespace mongo {
+
+ string getDbContext();
+
+#undef out
+
+
+#if !defined(_WIN32)
+
+} // namespace mongo
+
+#include <signal.h>
+#include <string.h>
+
+namespace mongo {
+
+ void pipeSigHandler( int signal ) {
+#ifdef psignal
+ psignal( signal, "Signal Received : ");
+#else
+ cout << "got pipe signal:" << signal << endl;
+#endif
+ }
+
+ void abruptQuit(int x) {
+ ostringstream ossSig;
+ ossSig << "Got signal: " << x << " (" << strsignal( x ) << ")." << endl;
+ rawOut( ossSig.str() );
+
+ /*
+ ostringstream ossOp;
+ ossOp << "Last op: " << currentOp.infoNoauth() << endl;
+ rawOut( ossOp.str() );
+ */
+
+ ostringstream oss;
+ oss << "Backtrace:" << endl;
+ printStackTrace( oss );
+ rawOut( oss.str() );
+
+ // Don't go through normal shutdown procedure. It may make things worse.
+ ::exit(EXIT_ABRUPT);
+
+ }
+
+ void abruptQuitWithAddrSignal( int signal, siginfo_t *siginfo, void * ) {
+ ostringstream oss;
+ oss << "Invalid";
+ if ( signal == SIGSEGV || signal == SIGBUS ) {
+ oss << " access";
+ } else {
+ oss << " operation";
+ }
+ oss << " at address: " << siginfo->si_addr << endl;
+ rawOut( oss.str() );
+ abruptQuit( signal );
+ }
+
+ sigset_t asyncSignals;
+    // the signals in asyncSignals will be processed by this thread only, in order to
+    // ensure the db and log mutexes aren't held.
+ void interruptThread() {
+ int x;
+ sigwait( &asyncSignals, &x );
+ log() << "got kill or ctrl c or hup signal " << x << " (" << strsignal( x ) << "), will terminate after current cmd ends" << endl;
+ Client::initThread( "interruptThread" );
+ exitCleanly( EXIT_KILL );
+ }
+
+ // this will be called in certain c++ error cases, for example if there are two active
+ // exceptions
+ void myterminate() {
+ rawOut( "terminate() called, printing stack:" );
+ printStackTrace();
+ ::abort();
+ }
+
+ // this gets called when new fails to allocate memory
+ void my_new_handler() {
+ rawOut( "out of memory, printing stack and exiting:" );
+ printStackTrace();
+ ::exit(EXIT_ABRUPT);
+ }
+
+ void setupSignals_ignoreHelper( int signal ) {}
+
+ void setupSignals( bool inFork ) {
+ struct sigaction addrSignals;
+ memset( &addrSignals, 0, sizeof( struct sigaction ) );
+ addrSignals.sa_sigaction = abruptQuitWithAddrSignal;
+ sigemptyset( &addrSignals.sa_mask );
+ addrSignals.sa_flags = SA_SIGINFO;
+
+ assert( sigaction(SIGSEGV, &addrSignals, 0) == 0 );
+ assert( sigaction(SIGBUS, &addrSignals, 0) == 0 );
+ assert( sigaction(SIGILL, &addrSignals, 0) == 0 );
+ assert( sigaction(SIGFPE, &addrSignals, 0) == 0 );
+
+ assert( signal(SIGABRT, abruptQuit) != SIG_ERR );
+ assert( signal(SIGQUIT, abruptQuit) != SIG_ERR );
+ assert( signal(SIGPIPE, pipeSigHandler) != SIG_ERR );
+
+ setupSIGTRAPforGDB();
+
+ sigemptyset( &asyncSignals );
+
+ if ( inFork )
+ assert( signal( SIGHUP , setupSignals_ignoreHelper ) != SIG_ERR );
+ else
+ sigaddset( &asyncSignals, SIGHUP );
+
+ sigaddset( &asyncSignals, SIGINT );
+ sigaddset( &asyncSignals, SIGTERM );
+ assert( pthread_sigmask( SIG_SETMASK, &asyncSignals, 0 ) == 0 );
+ boost::thread it( interruptThread );
+
+ set_terminate( myterminate );
+ set_new_handler( my_new_handler );
+ }
+
+#else
+ void consoleTerminate( const char* controlCodeName ) {
+ Client::initThread( "consoleTerminate" );
+ log() << "got " << controlCodeName << ", will terminate after current cmd ends" << endl;
+ exitCleanly( EXIT_KILL );
+ }
+
+ BOOL CtrlHandler( DWORD fdwCtrlType ) {
+
+ switch( fdwCtrlType ) {
+
+ case CTRL_C_EVENT:
+ rawOut( "Ctrl-C signal" );
+ consoleTerminate( "CTRL_C_EVENT" );
+ return TRUE ;
+
+ case CTRL_CLOSE_EVENT:
+ rawOut( "CTRL_CLOSE_EVENT signal" );
+ consoleTerminate( "CTRL_CLOSE_EVENT" );
+ return TRUE ;
+
+ case CTRL_BREAK_EVENT:
+ rawOut( "CTRL_BREAK_EVENT signal" );
+ consoleTerminate( "CTRL_BREAK_EVENT" );
+ return TRUE;
+
+ case CTRL_LOGOFF_EVENT:
+ rawOut( "CTRL_LOGOFF_EVENT signal" );
+ consoleTerminate( "CTRL_LOGOFF_EVENT" );
+ return TRUE;
+
+ case CTRL_SHUTDOWN_EVENT:
+ rawOut( "CTRL_SHUTDOWN_EVENT signal" );
+ consoleTerminate( "CTRL_SHUTDOWN_EVENT" );
+ return TRUE;
+
+ default:
+ return FALSE;
+ }
+ }
+
+ LPTOP_LEVEL_EXCEPTION_FILTER filtLast = 0;
+ ::HANDLE standardOut = GetStdHandle(STD_OUTPUT_HANDLE);
+ LONG WINAPI exceptionFilter(struct _EXCEPTION_POINTERS *ExceptionInfo) {
+ {
+            // given the severity of the event, we write to the console in addition to the --logFile
+            // (rawOut writes to the logfile, if one was specified)
+ DWORD written;
+ WriteFile(standardOut, "unhandled windows exception\n", 20, &written, 0);
+ FlushFileBuffers(standardOut);
+ }
+
+ DWORD ec = ExceptionInfo->ExceptionRecord->ExceptionCode;
+ if( ec == EXCEPTION_ACCESS_VIOLATION ) {
+ rawOut("access violation");
+ }
+ else {
+ rawOut("unhandled windows exception");
+ char buf[64];
+ strcpy(buf, "ec=0x");
+ _ui64toa(ec, buf+5, 16);
+ rawOut(buf);
+ }
+ if( filtLast )
+ return filtLast(ExceptionInfo);
+ return EXCEPTION_EXECUTE_HANDLER;
+ }
+
+ // called by mongoAbort()
+ extern void (*reportEventToSystem)(const char *msg);
+ void reportEventToSystemImpl(const char *msg) {
+ static ::HANDLE hEventLog = RegisterEventSource( NULL, TEXT("mongod") );
+ if( hEventLog ) {
+ std::wstring s = toNativeString(msg);
+ LPCTSTR txt = s.c_str();
+ BOOL ok = ReportEvent(
+ hEventLog, EVENTLOG_ERROR_TYPE,
+ 0, 0, NULL,
+ 1,
+ 0,
+ &txt,
+ 0);
+ wassert(ok);
+ }
+ }
+
+ void myPurecallHandler() {
+ printStackTrace();
+ mongoAbort("pure virtual");
+ }
+
+ void setupSignals( bool inFork ) {
+ reportEventToSystem = reportEventToSystemImpl;
+ filtLast = SetUnhandledExceptionFilter(exceptionFilter);
+ massert(10297 , "Couldn't register Windows Ctrl-C handler", SetConsoleCtrlHandler((PHANDLER_ROUTINE) CtrlHandler, TRUE));
+ _set_purecall_handler( myPurecallHandler );
+ }
+
+#endif
+
+} // namespace mongo
diff --git a/src/mongo/db/db.h b/src/mongo/db/db.h
new file mode 100644
index 00000000000..6a31a06f77c
--- /dev/null
+++ b/src/mongo/db/db.h
@@ -0,0 +1,120 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/net/message.h"
+#include "concurrency.h"
+#include "pdfile.h"
+#include "curop.h"
+#include "client.h"
+#include "databaseholder.h"
+
+namespace mongo {
+
+ struct dbtemprelease {
+ Client::Context * _context;
+ int _locktype;
+
+ dbtemprelease() {
+ const Client& c = cc();
+ _context = c.getContext();
+ _locktype = d.dbMutex.getState();
+ assert( _locktype );
+
+ if ( _locktype > 0 ) {
+ massert( 10298 , "can't temprelease nested write lock", _locktype == 1);
+ if ( _context ) _context->unlocked();
+ d.dbMutex.unlock();
+ }
+ else {
+ massert( 10299 , "can't temprelease nested read lock", _locktype == -1);
+ if ( _context ) _context->unlocked();
+ d.dbMutex.unlock_shared();
+ }
+
+ verify( 14814 , c.curop() );
+ c.curop()->yielded();
+
+ }
+ ~dbtemprelease() {
+ if ( _locktype > 0 )
+ d.dbMutex.lock();
+ else
+ d.dbMutex.lock_shared();
+
+ if ( _context ) _context->relocked();
+ }
+ };
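+
+    /* typical shape of a caller (illustrative sketch only):
+
+           {
+               dbtemprelease tr;  // yields the db lock (read or write) here
+               // ... do something slow that must not hold the lock ...
+           }                      // destructor reacquires the lock in the same mode
+    */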
+
+ /** must be write locked
+ no assert (and no release) if nested write lock
+ a lot like dbtempreleasecond but no malloc so should be a tiny bit faster
+ */
+ struct dbtempreleasewritelock {
+ Client::Context * _context;
+ int _locktype;
+ dbtempreleasewritelock() {
+ const Client& c = cc();
+ _context = c.getContext();
+ _locktype = d.dbMutex.getState();
+ assert( _locktype >= 1 );
+ if( _locktype > 1 )
+ return; // nested
+ if ( _context )
+ _context->unlocked();
+ d.dbMutex.unlock();
+ verify( 14845 , c.curop() );
+ c.curop()->yielded();
+ }
+ ~dbtempreleasewritelock() {
+ if ( _locktype == 1 )
+ d.dbMutex.lock();
+ if ( _context )
+ _context->relocked();
+ }
+ };
+
+ /**
+ only does a temp release if we're not nested and have a lock
+ */
+ struct dbtempreleasecond {
+ dbtemprelease * real;
+ int locktype;
+
+ dbtempreleasecond() {
+ real = 0;
+ locktype = d.dbMutex.getState();
+ if ( locktype == 1 || locktype == -1 )
+ real = new dbtemprelease();
+ }
+
+ ~dbtempreleasecond() {
+ if ( real ) {
+ delete real;
+ real = 0;
+ }
+ }
+
+ bool unlocked() {
+ return real != 0;
+ }
+ };
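+
+    /* illustrative sketch: dbtempreleasecond is safe whether or not the caller
+       holds a simple (non-nested) lock:
+
+           dbtempreleasecond drc;
+           if ( drc.unlocked() ) {
+               // the lock was actually yielded here
+           }
+    */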
+
+} // namespace mongo
+
+#include "concurrency.h"
diff --git a/src/mongo/db/db.rc b/src/mongo/db/db.rc
new file mode 100755
index 00000000000..b589458cf73
--- /dev/null
+++ b/src/mongo/db/db.rc
@@ -0,0 +1,12 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Icon
+//
+// Icon with lowest ID value placed first to ensure application icon
+// remains consistent on all systems.
+IDI_ICON2 ICON "mongo.ico"
+///////////////////////////////////////////////////////////////////////////// \ No newline at end of file
diff --git a/src/mongo/db/db.vcxproj b/src/mongo/db/db.vcxproj
new file mode 100755
index 00000000000..8963f0af580
--- /dev/null
+++ b/src/mongo/db/db.vcxproj
@@ -0,0 +1,934 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectName>mongod</ProjectName>
+ <ProjectGuid>{215B2D68-0A70-4D10-8E75-B31010C62A91}</ProjectGuid>
+ <RootNamespace>db</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.;..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..;$(IncludePath)</IncludePath>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>No</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;;;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>;;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\s\default_version.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
+ <ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\alignedbuilder.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\compress.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="..\util\intrusive_counter.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\systeminfo_win32.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="btreebuilder.cpp" />
+ <ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\cloud.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="commands\distinct.cpp">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="commands\document_source_cursor.cpp" />
+ <ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="commands\pipeline_command.cpp" />
+ <ClCompile Include="commands\pipeline.cpp" />
+ <ClCompile Include="compact.cpp" />
+ <ClCompile Include="curop.cpp" />
+ <ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dbmessage.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
+ <ClCompile Include="d_concurrency.cpp" />
+ <ClCompile Include="d_globals.cpp" />
+ <ClCompile Include="geo\2d.cpp" />
+ <ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="key.cpp" />
+ <ClCompile Include="mongommf.cpp" />
+ <ClCompile Include="oplog.cpp" />
+ <ClCompile Include="ops\count.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="ops\delete.cpp" />
+ <ClCompile Include="ops\query.cpp" />
+ <ClCompile Include="ops\update.cpp" />
+ <ClCompile Include="pagefault.cpp" />
+ <ClCompile Include="pipeline\accumulator.cpp" />
+ <ClCompile Include="pipeline\accumulator_add_to_set.cpp" />
+ <ClCompile Include="pipeline\accumulator_avg.cpp" />
+ <ClCompile Include="pipeline\accumulator_first.cpp" />
+ <ClCompile Include="pipeline\accumulator_last.cpp" />
+ <ClCompile Include="pipeline\accumulator_min_max.cpp" />
+ <ClCompile Include="pipeline\accumulator_push.cpp" />
+ <ClCompile Include="pipeline\accumulator_single_value.cpp" />
+ <ClCompile Include="pipeline\accumulator_sum.cpp" />
+ <ClCompile Include="pipeline\builder.cpp" />
+ <ClCompile Include="pipeline\document.cpp" />
+ <ClCompile Include="pipeline\document_source.cpp" />
+ <ClCompile Include="pipeline\document_source_bson_array.cpp" />
+ <ClCompile Include="pipeline\document_source_command_futures.cpp" />
+ <ClCompile Include="pipeline\document_source_filter.cpp" />
+ <ClCompile Include="pipeline\document_source_filter_base.cpp" />
+ <ClCompile Include="pipeline\document_source_group.cpp" />
+ <ClCompile Include="pipeline\document_source_limit.cpp" />
+ <ClCompile Include="pipeline\document_source_match.cpp" />
+ <ClCompile Include="pipeline\document_source_out.cpp" />
+ <ClCompile Include="pipeline\document_source_project.cpp" />
+ <ClCompile Include="pipeline\document_source_skip.cpp" />
+ <ClCompile Include="pipeline\document_source_sort.cpp" />
+ <ClCompile Include="pipeline\document_source_unwind.cpp" />
+ <ClCompile Include="pipeline\doc_mem_monitor.cpp" />
+ <ClCompile Include="pipeline\expression.cpp" />
+ <ClCompile Include="pipeline\expression_context.cpp" />
+ <ClCompile Include="pipeline\field_path.cpp" />
+ <ClCompile Include="pipeline\value.cpp" />
+ <ClCompile Include="projection.cpp" />
+ <ClCompile Include="queryoptimizercursor.cpp" />
+ <ClCompile Include="querypattern.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="record.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="repl\consensus.cpp" />
+ <ClCompile Include="repl\heartbeat.cpp" />
+ <ClCompile Include="repl\manager.cpp" />
+ <ClCompile Include="repl\rs_initialsync.cpp" />
+ <ClCompile Include="repl\rs_initiate.cpp" />
+ <ClCompile Include="repl\rs_rollback.cpp" />
+ <ClCompile Include="repl\rs_sync.cpp" />
+ <ClCompile Include="repl_block.cpp" />
+ <ClCompile Include="restapi.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\pch.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="common.cpp">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="database.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbcommands_admin.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="index.cpp" />
+ <ClCompile Include="indexkey.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="matcher_covered.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="scanandorder.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="security_common.cpp" />
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="cmdline.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\util\net\httpclient.cpp" />
+ <ClCompile Include="..\util\md5.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeaderFile>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeaderFile>
+ </ClCompile>
+ <ClCompile Include="..\util\md5main.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Use</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Use</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="stats\counters.cpp" />
+ <ClCompile Include="stats\snapshots.cpp" />
+ <ClCompile Include="stats\top.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="repl\health.cpp" />
+ <ClCompile Include="repl\rs.cpp" />
+ <ClCompile Include="repl\replset_commands.cpp" />
+ <ClCompile Include="repl\rs_config.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
+ <None Include="..\jstests\replsets\replset1.js" />
+ <None Include="..\jstests\replsets\replset2.js" />
+ <None Include="..\jstests\replsets\replset3.js" />
+ <None Include="..\jstests\replsets\replset4.js" />
+ <None Include="..\jstests\replsets\replset5.js" />
+ <None Include="..\jstests\replsets\replsetadd.js" />
+ <None Include="..\jstests\replsets\replsetarb1.js" />
+ <None Include="..\jstests\replsets\replsetarb2.js" />
+ <None Include="..\jstests\replsets\replsetprio1.js" />
+ <None Include="..\jstests\replsets\replsetrestart1.js" />
+ <None Include="..\jstests\replsets\replsetrestart2.js" />
+ <None Include="..\jstests\replsets\replset_remove_node.js" />
+ <None Include="..\jstests\replsets\rollback.js" />
+ <None Include="..\jstests\replsets\rollback2.js" />
+ <None Include="..\jstests\replsets\sync1.js" />
+ <None Include="..\jstests\replsets\twosets.js" />
+ <None Include="..\SConstruct" />
+ <None Include="..\util\mongoutils\README" />
+ <None Include="mongo.ico" />
+ <None Include="repl\notes.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\bson\util\atomic_int.h" />
+ <ClInclude Include="..\bson\util\builder.h" />
+ <ClInclude Include="..\bson\util\misc.h" />
+ <ClInclude Include="..\client\dbclientcursor.h" />
+ <ClInclude Include="..\client\distlock.h" />
+ <ClInclude Include="..\client\gridfs.h" />
+ <ClInclude Include="..\client\parallel.h" />
+ <ClInclude Include="..\s\d_logic.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\third_party\snappy\config.h" />
+ <ClInclude Include="..\third_party\snappy\snappy.h" />
+ <ClInclude Include="..\util\alignedbuilder.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
+ <ClInclude Include="..\util\concurrency\rwlock.h" />
+ <ClInclude Include="..\util\concurrency\msg.h" />
+ <ClInclude Include="..\util\concurrency\mutex.h" />
+ <ClInclude Include="..\util\concurrency\mvar.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\intrusive_counter.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
+ <ClInclude Include="..\util\mongoutils\html.h" />
+ <ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\net\hostandport.h" />
+ <ClInclude Include="..\util\net\listen.h" />
+ <ClInclude Include="..\util\net\message_port.h" />
+ <ClInclude Include="..\util\net\miniwebserver.h" />
+ <ClInclude Include="..\util\paths.h" />
+ <ClInclude Include="..\util\ramlog.h" />
+ <ClInclude Include="..\util\systeminfo.h" />
+ <ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="databaseholder.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="dur_stats.h" />
+ <ClInclude Include="d_globals.h" />
+ <ClInclude Include="geo\core.h" />
+ <ClInclude Include="globals.h" />
+ <ClInclude Include="helpers\dblogger.h" />
+ <ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
+ <ClInclude Include="namespacestring.h" />
+ <ClInclude Include="oplogreader.h" />
+ <ClInclude Include="ops\count.h" />
+ <ClInclude Include="ops\delete.h" />
+ <ClInclude Include="ops\update.h" />
+ <ClInclude Include="pagefault.h" />
+ <ClInclude Include="pipeline\accumulator.h" />
+ <ClInclude Include="pipeline\builder.h" />
+ <ClInclude Include="pipeline\document.h" />
+ <ClInclude Include="pipeline\document_source.h" />
+ <ClInclude Include="pipeline\doc_mem_monitor.h" />
+ <ClInclude Include="pipeline\expression.h" />
+ <ClInclude Include="pipeline\expression_context.h" />
+ <ClInclude Include="pipeline\field_path.h" />
+ <ClInclude Include="pipeline\value.h" />
+ <ClInclude Include="projection.h" />
+ <ClInclude Include="queryutil.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replpair.h" />
+ <ClInclude Include="repl\connections.h" />
+ <ClInclude Include="repl\multicmd.h" />
+ <ClInclude Include="repl\rsmember.h" />
+ <ClInclude Include="repl\rs_optime.h" />
+ <ClInclude Include="stats\counters.h" />
+ <ClInclude Include="stats\snapshots.h" />
+ <ClInclude Include="stats\top.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\client\redef_macros.h" />
+ <ClInclude Include="..\client\syncclusterconnection.h" />
+ <ClInclude Include="..\client\undef_macros.h" />
+ <ClInclude Include="background.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="diskloc.h" />
+ <ClInclude Include="index.h" />
+ <ClInclude Include="indexkey.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="..\util\allocator.h" />
+ <ClInclude Include="..\util\array.h" />
+ <ClInclude Include="..\util\assert_util.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\util\base64.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\debug_util.h" />
+ <ClInclude Include="..\util\embedded_builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\file_allocator.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\util\hex.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\ntservice.h" />
+ <ClInclude Include="..\util\optime.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\queue.h" />
+ <ClInclude Include="..\util\ramstore.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\web\html.h" />
+ <ClInclude Include="..\util\net\httpclient.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\util\net\message.h" />
+ <ClInclude Include="..\util\net\message_server.h" />
+ <ClInclude Include="..\util\net\sock.h" />
+ <ClInclude Include="..\scripting\engine.h" />
+ <ClInclude Include="..\scripting\engine_spidermonkey.h" />
+ <ClInclude Include="..\scripting\engine_v8.h" />
+ <ClInclude Include="..\scripting\v8_db.h" />
+ <ClInclude Include="..\scripting\v8_utils.h" />
+ <ClInclude Include="..\scripting\v8_wrapper.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="repl\health.h" />
+ <ClInclude Include="repl\rs.h" />
+ <ClInclude Include="repl\rs_config.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsoninlines.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js32r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="db.rc" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project>
\ No newline at end of file
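
Note on the precompiled-header settings in the project file above: the project-wide default is <PrecompiledHeader>Use</PrecompiledHeader> with pch.h; ..\pch.cpp overrides it to Create, sources such as ..\shell\mongo.cpp and the snappy files override it to NotUsing, and the empty <PrecompiledHeader></PrecompiledHeader> elements on the pcre-7.4 and md5 sources clear the inherited value, which disables the PCH for those files just as NotUsing does. A minimal sketch of that MSBuild pattern, for illustration only (foo.cpp and bar.c are hypothetical file names, not part of this commit):

  <ItemDefinitionGroup>
    <ClCompile>
      <PrecompiledHeader>Use</PrecompiledHeader>      <!-- project default: consume the PCH -->
      <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
    </ClCompile>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClCompile Include="pch.cpp">
      <PrecompiledHeader>Create</PrecompiledHeader>   <!-- builds the .pch once -->
    </ClCompile>
    <ClCompile Include="foo.cpp" />                   <!-- inherits Use from the defaults -->
    <ClCompile Include="bar.c">
      <PrecompiledHeader>NotUsing</PrecompiledHeader> <!-- opt out, e.g. for plain C sources -->
    </ClCompile>
  </ItemGroup>
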
diff --git a/src/mongo/db/db.vcxproj.filters b/src/mongo/db/db.vcxproj.filters
new file mode 100755
index 00000000000..a39df0dc796
--- /dev/null
+++ b/src/mongo/db/db.vcxproj.filters
@@ -0,0 +1,432 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo.cpp" />
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
+ <ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\util\alignedbuilder.cpp" />
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\distinct.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="compact.cpp" />
+ <ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
+ <ClCompile Include="geo\2d.cpp" />
+ <ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="mongommf.cpp" />
+ <ClCompile Include="oplog.cpp" />
+ <ClCompile Include="projection.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="repl\consensus.cpp" />
+ <ClCompile Include="repl\heartbeat.cpp" />
+ <ClCompile Include="repl\manager.cpp" />
+ <ClCompile Include="repl\rs_initialsync.cpp" />
+ <ClCompile Include="repl\rs_initiate.cpp" />
+ <ClCompile Include="repl\rs_rollback.cpp" />
+ <ClCompile Include="repl\rs_sync.cpp" />
+ <ClCompile Include="repl_block.cpp" />
+ <ClCompile Include="restapi.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\pch.cpp" />
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="common.cpp" />
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="database.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbcommands_admin.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="index.cpp" />
+ <ClCompile Include="indexkey.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="matcher_covered.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="cmdline.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\util\md5.c" />
+ <ClCompile Include="..\util\md5main.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="stats\counters.cpp" />
+ <ClCompile Include="stats\snapshots.cpp" />
+ <ClCompile Include="stats\top.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="repl\health.cpp" />
+ <ClCompile Include="repl\rs.cpp" />
+ <ClCompile Include="repl\replset_commands.cpp" />
+ <ClCompile Include="repl\rs_config.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="querypattern.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="key.cpp" />
+ <ClCompile Include="btreebuilder.cpp" />
+ <ClCompile Include="queryoptimizercursor.cpp" />
+ <ClCompile Include="record.cpp" />
+ <ClCompile Include="ops\delete.cpp" />
+ <ClCompile Include="ops\update.cpp" />
+ <ClCompile Include="security_common.cpp" />
+ <ClCompile Include="ops\query.cpp" />
+ <ClCompile Include="..\util\net\httpclient.cpp" />
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="dbmessage.cpp" />
+ <ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="..\util\compress.cpp">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="scanandorder.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c" />
+ <ClCompile Include="commands\cloud.cpp" />
+ <ClCompile Include="commands\pipeline_command.cpp" />
+ <ClCompile Include="commands\pipeline.cpp" />
+ <ClCompile Include="pipeline\accumulator.cpp" />
+ <ClCompile Include="pipeline\accumulator_add_to_set.cpp" />
+ <ClCompile Include="pipeline\accumulator_avg.cpp" />
+ <ClCompile Include="pipeline\accumulator_first.cpp" />
+ <ClCompile Include="pipeline\accumulator_last.cpp" />
+ <ClCompile Include="pipeline\accumulator_min_max.cpp" />
+ <ClCompile Include="pipeline\accumulator_push.cpp" />
+ <ClCompile Include="pipeline\accumulator_single_value.cpp" />
+ <ClCompile Include="pipeline\accumulator_sum.cpp" />
+ <ClCompile Include="pipeline\builder.cpp" />
+ <ClCompile Include="pipeline\doc_mem_monitor.cpp" />
+ <ClCompile Include="pipeline\document.cpp" />
+ <ClCompile Include="pipeline\document_source.cpp" />
+ <ClCompile Include="pipeline\document_source_bson_array.cpp" />
+ <ClCompile Include="pipeline\document_source_command_futures.cpp" />
+ <ClCompile Include="pipeline\document_source_filter.cpp" />
+ <ClCompile Include="pipeline\document_source_filter_base.cpp" />
+ <ClCompile Include="pipeline\document_source_group.cpp" />
+ <ClCompile Include="pipeline\document_source_limit.cpp" />
+ <ClCompile Include="pipeline\document_source_match.cpp" />
+ <ClCompile Include="pipeline\document_source_out.cpp" />
+ <ClCompile Include="pipeline\document_source_project.cpp" />
+ <ClCompile Include="pipeline\document_source_skip.cpp" />
+ <ClCompile Include="pipeline\document_source_sort.cpp" />
+ <ClCompile Include="pipeline\document_source_unwind.cpp" />
+ <ClCompile Include="pipeline\expression.cpp" />
+ <ClCompile Include="pipeline\expression_context.cpp" />
+ <ClCompile Include="pipeline\field_path.cpp" />
+ <ClCompile Include="pipeline\value.cpp" />
+ <ClCompile Include="..\util\intrusive_counter.cpp" />
+ <ClCompile Include="..\util\systeminfo_win32.cpp" />
+ <ClCompile Include="commands\document_source_cursor.cpp" />
+ <ClCompile Include="d_concurrency.cpp" />
+ <ClCompile Include="..\s\default_version.cpp" />
+ <ClCompile Include="ops\count.cpp" />
+ <ClCompile Include="pagefault.cpp" />
+ <ClCompile Include="d_globals.cpp" />
+ <ClCompile Include="curop.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\client\dbclientcursor.h" />
+ <ClInclude Include="..\client\distlock.h" />
+ <ClInclude Include="..\client\gridfs.h" />
+ <ClInclude Include="..\client\parallel.h" />
+ <ClInclude Include="..\s\d_logic.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\util\concurrency\rwlock.h" />
+ <ClInclude Include="..\util\concurrency\msg.h" />
+ <ClInclude Include="..\util\concurrency\mutex.h" />
+ <ClInclude Include="..\util\concurrency\mvar.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
+ <ClInclude Include="..\util\mongoutils\html.h" />
+ <ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\paths.h" />
+ <ClInclude Include="..\util\ramlog.h" />
+ <ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_stats.h" />
+ <ClInclude Include="geo\core.h" />
+ <ClInclude Include="helpers\dblogger.h" />
+ <ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
+ <ClInclude Include="oplogreader.h" />
+ <ClInclude Include="projection.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replpair.h" />
+ <ClInclude Include="repl\connections.h" />
+ <ClInclude Include="repl\multicmd.h" />
+ <ClInclude Include="repl\rsmember.h" />
+ <ClInclude Include="repl\rs_optime.h" />
+ <ClInclude Include="stats\counters.h" />
+ <ClInclude Include="stats\snapshots.h" />
+ <ClInclude Include="stats\top.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\client\redef_macros.h" />
+ <ClInclude Include="..\client\syncclusterconnection.h" />
+ <ClInclude Include="..\client\undef_macros.h" />
+ <ClInclude Include="background.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="diskloc.h" />
+ <ClInclude Include="index.h" />
+ <ClInclude Include="indexkey.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="..\util\allocator.h" />
+ <ClInclude Include="..\util\array.h" />
+ <ClInclude Include="..\util\assert_util.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\util\base64.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\debug_util.h" />
+ <ClInclude Include="..\util\embedded_builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\file_allocator.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\util\hex.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\ntservice.h" />
+ <ClInclude Include="..\util\optime.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\queue.h" />
+ <ClInclude Include="..\util\ramstore.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\web\html.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\scripting\engine.h" />
+ <ClInclude Include="..\scripting\engine_spidermonkey.h" />
+ <ClInclude Include="..\scripting\engine_v8.h" />
+ <ClInclude Include="..\scripting\v8_db.h" />
+ <ClInclude Include="..\scripting\v8_utils.h" />
+ <ClInclude Include="..\scripting\v8_wrapper.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="repl\health.h" />
+ <ClInclude Include="repl\rs.h" />
+ <ClInclude Include="repl\rs_config.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsoninlines.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
+ <ClInclude Include="..\util\alignedbuilder.h" />
+ <ClInclude Include="queryutil.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\bson\util\atomic_int.h" />
+ <ClInclude Include="..\bson\util\builder.h" />
+ <ClInclude Include="..\bson\util\misc.h" />
+ <ClInclude Include="ops\delete.h" />
+ <ClInclude Include="ops\update.h" />
+ <ClInclude Include="..\util\net\httpclient.h" />
+ <ClInclude Include="..\util\net\message.h" />
+ <ClInclude Include="..\util\net\message_server.h" />
+ <ClInclude Include="..\util\net\sock.h" />
+ <ClInclude Include="..\third_party\snappy\config.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="globals.h" />
+ <ClInclude Include="..\util\net\hostandport.h" />
+ <ClInclude Include="..\util\net\listen.h" />
+ <ClInclude Include="..\util\net\message_port.h" />
+ <ClInclude Include="..\util\net\miniwebserver.h" />
+ <ClInclude Include="databaseholder.h" />
+ <ClInclude Include="pipeline\accumulator.h" />
+ <ClInclude Include="pipeline\builder.h" />
+ <ClInclude Include="pipeline\doc_mem_monitor.h" />
+ <ClInclude Include="pipeline\document.h" />
+ <ClInclude Include="pipeline\document_source.h" />
+ <ClInclude Include="pipeline\expression.h" />
+ <ClInclude Include="pipeline\expression_context.h" />
+ <ClInclude Include="pipeline\field_path.h" />
+ <ClInclude Include="pipeline\value.h" />
+ <ClInclude Include="..\util\intrusive_counter.h" />
+ <ClInclude Include="..\util\systeminfo.h" />
+ <ClInclude Include="namespacestring.h" />
+ <ClInclude Include="ops\count.h" />
+ <ClInclude Include="pagefault.h" />
+ <ClInclude Include="d_globals.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="db.rc" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
+ <None Include="..\jstests\replsets\replset1.js" />
+ <None Include="..\jstests\replsets\replset2.js" />
+ <None Include="..\jstests\replsets\replset3.js" />
+ <None Include="..\jstests\replsets\replset4.js" />
+ <None Include="..\jstests\replsets\replset5.js" />
+ <None Include="..\jstests\replsets\replsetadd.js" />
+ <None Include="..\jstests\replsets\replsetarb1.js" />
+ <None Include="..\jstests\replsets\replsetarb2.js" />
+ <None Include="..\jstests\replsets\replsetprio1.js" />
+ <None Include="..\jstests\replsets\replsetrestart1.js" />
+ <None Include="..\jstests\replsets\replsetrestart2.js" />
+ <None Include="..\jstests\replsets\replset_remove_node.js" />
+ <None Include="..\jstests\replsets\rollback.js" />
+ <None Include="..\jstests\replsets\rollback2.js" />
+ <None Include="..\jstests\replsets\sync1.js" />
+ <None Include="..\jstests\replsets\twosets.js" />
+ <None Include="..\SConstruct" />
+ <None Include="..\util\mongoutils\README" />
+ <None Include="mongo.ico" />
+ <None Include="repl\notes.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib" />
+ <Library Include="..\..\js\js32r.lib" />
+ <Library Include="..\..\js\js64d.lib" />
+ <Library Include="..\..\js\js64r.lib" />
+ </ItemGroup>
+ <ItemGroup>
+ <Filter Include="snappy">
+ <UniqueIdentifier>{bb99c086-7926-4f50-838d-f5f0c18397c0}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+</Project>
\ No newline at end of file
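
The .vcxproj.filters file above affects only Solution Explorer grouping, not the build: each visual folder is declared once with a GUID (as with the "snappy" filter), and individual files are then assigned to it per item. A minimal sketch of that pattern (the folder name, file name, and GUID here are placeholders):

  <ItemGroup>
    <Filter Include="third_party">
      <UniqueIdentifier>{00000000-0000-0000-0000-000000000000}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="..\third_party\example.c">
      <Filter>third_party</Filter>
    </ClCompile>
  </ItemGroup>
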
diff --git a/src/mongo/db/db_10.sln b/src/mongo/db/db_10.sln
new file mode 100755
index 00000000000..c1d83f3901a
--- /dev/null
+++ b/src/mongo/db/db_10.sln
@@ -0,0 +1,168 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{4082881B-EB00-486F-906C-843B8EC06E18}"
+ ProjectSection(SolutionItems) = preProject
+ driverHelpers.cpp = driverHelpers.cpp
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}"
+ ProjectSection(SolutionItems) = preProject
+ ..\shell\msvc\createCPPfromJavaScriptFiles.js = ..\shell\msvc\createCPPfromJavaScriptFiles.js
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "unix files", "unix files", "{2F760952-C71B-4865-998F-AABAE96D1373}"
+ ProjectSection(SolutionItems) = preProject
+ ..\util\processinfo_darwin.cpp = ..\util\processinfo_darwin.cpp
+ ..\util\processinfo_linux2.cpp = ..\util\processinfo_linux2.cpp
+ ..\util\processinfo_none.cpp = ..\util\processinfo_none.cpp
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "other", "other", "{12B11474-2D74-48C3-BB3D-F03249BEA88F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongod", "db.vcxproj", "{215B2D68-0A70-4D10-8E75-B31010C62A91}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongos", "..\s\dbgrid.vcxproj", "{E03717ED-69B4-4D21-BC55-DF6690B585C6}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "..\dbtests\test.vcxproj", "{215B2D68-0A70-4D10-8E75-B33010C62A91}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bsondemo", "..\bson\bsondemo\bsondemo.vcxproj", "{C9DB5EB7-81AA-4185-BAA1-DA035654402F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongoutils test program", "..\util\mongoutils\mongoutils.vcxproj", "{7B84584E-92BC-4DB9-971B-A1A8F93E5053}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple_client_demo", "..\client\examples\simple_client_demo.vcxproj", "{89C30BC3-2874-4F2C-B4DA-EB04E9782236}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongo", "..\shell\msvc\mongo.vcxproj", "{FE959BD8-8EE2-4555-AE59-9FA14FFD410E}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongoperf", "..\client\examples\mongoperf.vcxproj", "{79D4E297-BFB7-4FF2-9B13-08A146582E46}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Debug|Mixed Platforms = Debug|Mixed Platforms
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Any CPU = Release|Any CPU
+ Release|Mixed Platforms = Release|Mixed Platforms
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|x64.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|x64.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Any CPU.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Mixed Platforms.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|x64.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|x64.Build.0 = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.Build.0 = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|x64.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|x64.Build.0 = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Any CPU.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Mixed Platforms.Build.0 = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.ActiveCfg = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.Build.0 = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|x64.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|x64.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Any CPU.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Mixed Platforms.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.Build.0 = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.ActiveCfg = Debug|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.Build.0 = Debug|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|x64.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|x64.Build.0 = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Any CPU.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Mixed Platforms.Build.0 = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.ActiveCfg = Release|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.Build.0 = Release|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|x64.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|x64.Build.0 = Release|x64
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Win32.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Win32.Build.0 = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|x64.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Any CPU.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Win32.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Win32.Build.0 = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|x64.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Win32.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Win32.Build.0 = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|x64.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Any CPU.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Win32.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Win32.Build.0 = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|x64.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Win32.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Win32.Build.0 = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|x64.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Any CPU.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Win32.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Win32.Build.0 = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|x64.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Win32.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Win32.Build.0 = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|x64.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Any CPU.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Win32.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Win32.Build.0 = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|x64.ActiveCfg = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(NestedProjects) = preSolution
+ {2B262D59-9DC7-4BF1-A431-1BD4966899A5} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {2F760952-C71B-4865-998F-AABAE96D1373} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {4082881B-EB00-486F-906C-843B8EC06E18} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ EndGlobalSection
+EndGlobal
diff --git a/src/mongo/db/dbcommands.cpp b/src/mongo/db/dbcommands.cpp
new file mode 100644
index 00000000000..570c897fae4
--- /dev/null
+++ b/src/mongo/db/dbcommands.cpp
@@ -0,0 +1,1955 @@
+// dbcommands.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* SHARDING:
+ I believe this file is for mongod only.
+ See s/commands_public.cpp for mongos.
+*/
+
+#include "pch.h"
+#include "ops/count.h"
+#include "ops/query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../bson/util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "../util/md5.hpp"
+#include "../util/processinfo.h"
+#include "../util/ramlog.h"
+#include "json.h"
+#include "repl.h"
+#include "repl_block.h"
+#include "replutil.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "queryoptimizer.h"
+#include "../scripting/engine.h"
+#include "stats/counters.h"
+#include "background.h"
+#include "../util/version.h"
+#include "../s/d_writeback.h"
+#include "dur_stats.h"
+
+namespace mongo {
+
+ namespace dur {
+ void setAgeOutJournalFiles(bool rotate);
+ }
+ /** @return true if fields found */
+ bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ BSONElement e = cmdObj["ageOutJournalFiles"];
+ if( !e.eoo() ) {
+ bool r = e.trueValue();
+ log() << "ageOutJournalFiles " << r << endl;
+ dur::setAgeOutJournalFiles(r);
+ return true;
+ }
+ return false;
+ }
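+
+ /* Illustrative only (a sketch, not part of the command set defined in this
+ file): setParmsMongodSpecific is reached through the setParameter command,
+ so toggling journal file aging from the shell would look roughly like
+ db.adminCommand( { setParameter : 1, ageOutJournalFiles : false } )
+ assuming an authenticated admin connection. */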
+
+ /* reset any errors so that getlasterror comes back clean.
+
+ useful before performing a long series of operations where we want to
+ see if any of the operations triggered an error, but don't want to check
+ after each op as that would be a client/server turnaround.
+ */
+ class CmdResetError : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "reset error state (used with getpreverror)";
+ }
+ CmdResetError() : Command("resetError", false, "reseterror") {}
+ bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.get();
+ assert( le );
+ le->reset();
+ return true;
+ }
+ } cmdResetError;
+
+ /* set by replica sets if specified in the configuration.
+ a pointer is used to avoid any possible locking issues with lockless reading (see locktype() below:
+ it is NONE and we would like to keep it that way)
+ (for now, it simply orphans any old copy as config changes should be extremely rare).
+ note: once non-null, never goes to null again.
+ */
+ BSONObj *getLastErrorDefault = 0;
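+
+ /* Sketch of where the default comes from, assuming the replica set config
+ format of this era: the config document may carry
+ settings : { getLastErrorDefaults : { w : 2, wtimeout : 5000 } }
+ which the replica set code parses into the BSONObj pointed to here. */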
+
+ class CmdGetLastError : public Command {
+ public:
+ CmdGetLastError() : Command("getLastError", false, "getlasterror") { }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "return error status of the last operation on this connection\n"
+ << "options:\n"
+ << " { fsync:true } - fsync before returning, or wait for journal commit if running with --journal\n"
+ << " { j:true } - wait for journal commit if running with --journal\n"
+ << " { w:n } - await replication to n servers (including self) before returning\n"
+ << " { wtimeout:m} - timeout for w in m milliseconds";
+ }
+ bool run(const string& dbname, BSONObj& _cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.disableForCommand();
+
+ bool err = false;
+
+ if ( le->nPrev != 1 )
+ err = LastError::noError.appendSelf( result , false );
+ else
+ err = le->appendSelf( result , false );
+
+ Client& c = cc();
+ c.appendLastOp( result );
+
+ result.appendNumber( "connectionId" , c.getConnectionId() ); // for sharding; also useful in general for debugging
+
+ BSONObj cmdObj = _cmdObj;
+ {
+ BSONObj::iterator i(_cmdObj);
+ i.next();
+ if( !i.more() ) {
+ /* empty, use default */
+ BSONObj *def = getLastErrorDefault;
+ if( def )
+ cmdObj = *def;
+ }
+ }
+
+ if ( cmdObj["j"].trueValue() ) {
+ if( !getDur().awaitCommit() ) {
+ // --journal is off
+ result.append("jnote", "journaling not enabled on this server");
+ }
+ if( cmdObj["fsync"].trueValue() ) {
+ errmsg = "fsync and j options are not used together";
+ return false;
+ }
+ }
+ else if ( cmdObj["fsync"].trueValue() ) {
+ Timer t;
+ if( !getDur().awaitCommit() ) {
+ // if we get here, we are not running with --journal
+ log() << "fsync from getlasterror" << endl;
+ result.append( "fsyncFiles" , MemoryMappedFile::flushAll( true ) );
+ }
+ else {
+ // this is perhaps temporary: report how long we waited for the group commit to occur.
+ result.append( "waited", t.millis() );
+ }
+ }
+
+ if ( err ) {
+ // doesn't make sense to wait for replication
+ // if there was an error
+ return true;
+ }
+
+ BSONElement e = cmdObj["w"];
+ if ( e.ok() ) {
+ int timeout = cmdObj["wtimeout"].numberInt();
+ Timer t;
+
+ long long passes = 0;
+ char buf[32];
+ while ( 1 ) {
+ OpTime op(c.getLastOp());
+
+ if ( op.isNull() ) {
+ if ( anyReplEnabled() ) {
+ result.append( "wnote" , "no write has been done on this connection" );
+ }
+ else if ( e.isNumber() && e.numberInt() <= 1 ) {
+ // don't do anything
+ // w=1 and no repl, so this is fine
+ }
+ else {
+ // w=2 and no repl
+ result.append( "wnote" , "no replication has been enabled, so w=2+ won't work" );
+ result.append( "err", "norepl" );
+ return true;
+ }
+ break;
+ }
+
+ // check this first for w=0 or w=1
+ if ( opReplicatedEnough( op, e ) ) {
+ break;
+ }
+
+ // if replication isn't enabled (e.g., config servers)
+ if ( ! anyReplEnabled() ) {
+ result.append( "err", "norepl" );
+ return true;
+ }
+
+
+ if ( timeout > 0 && t.millis() >= timeout ) {
+ result.append( "wtimeout" , true );
+ errmsg = "timed out waiting for slaves";
+ result.append( "waited" , t.millis() );
+ result.append( "err" , "timeout" );
+ return true;
+ }
+
+ assert( sprintf( buf , "w block pass: %lld" , ++passes ) < 30 );
+ c.curop()->setMessage( buf );
+ sleepmillis(1);
+ killCurrentOp.checkForInterrupt();
+ }
+ result.appendNumber( "wtime" , t.millis() );
+ }
+
+ result.appendNull( "err" );
+ return true;
+ }
+ } cmdGetLastError;
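+
+ /* Example invocation (illustrative; options per the help text above):
+ db.runCommand( { getLastError : 1, w : 2, wtimeout : 5000 } )
+ waits until the last write on this connection has replicated to two members
+ (including self), or until five seconds have elapsed, whichever comes first. */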
+
+ class CmdGetPrevError : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "check for errors since last reseterror commandcal";
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ CmdGetPrevError() : Command("getPrevError", false, "getpreverror") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.disableForCommand();
+ le->appendSelf( result );
+ if ( le->valid )
+ result.append( "nPrev", le->nPrev );
+ else
+ result.append( "nPrev", -1 );
+ return true;
+ }
+ } cmdGetPrevError;
+
+ CmdShutdown cmdShutdown;
+
+ void CmdShutdown::help( stringstream& help ) const {
+ help << "shutdown the database. must be ran against admin db and "
+ << "either (1) ran from localhost or (2) authenticated. If "
+ << "this is a primary in a replica set and there is no member "
+ << "within 10 seconds of its optime, it will not shutdown "
+ << "without force : true. You can also specify timeoutSecs : "
+ << "N to wait N seconds for other members to catch up.";
+ }
+
+ bool CmdShutdown::run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+
+ if (!force && theReplSet && theReplSet->isPrimary()) {
+ long long timeout, now, start;
+ timeout = now = start = curTimeMicros64()/1000000;
+ if (cmdObj.hasField("timeoutSecs")) {
+ timeout += cmdObj["timeoutSecs"].numberLong();
+ }
+
+ OpTime lastOp = theReplSet->lastOpTimeWritten;
+ OpTime closest = theReplSet->lastOtherOpTime();
+ long long int diff = lastOp.getSecs() - closest.getSecs();
+ while (now <= timeout && (diff < 0 || diff > 10)) {
+ sleepsecs(1);
+ now++;
+
+ lastOp = theReplSet->lastOpTimeWritten;
+ closest = theReplSet->lastOtherOpTime();
+ diff = lastOp.getSecs() - closest.getSecs();
+ }
+
+ if (diff < 0 || diff > 10) {
+ errmsg = "no secondaries within 10 seconds of my optime";
+ result.append("closest", closest.getSecs());
+ result.append("difference", diff);
+ return false;
+ }
+
+ // step down
+ theReplSet->stepDown(120);
+
+ log() << "waiting for secondaries to catch up" << endl;
+
+ lastOp = theReplSet->lastOpTimeWritten;
+ while (lastOp != closest && now - start < 60) {
+ closest = theReplSet->lastOtherOpTime();
+
+ now++;
+ sleepsecs(1);
+ }
+
+ // regardless of whether they caught up, we'll shut down
+ }
+
+ return shutdownHelper();
+ }
+
+ class CmdDropDatabase : public Command {
+ public:
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "drop (delete) this database";
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ CmdDropDatabase() : Command("dropDatabase") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.firstElement();
+ log() << "dropDatabase " << dbname << endl;
+ int p = (int) e.number();
+ if ( p != 1 )
+ return false;
+ dropDatabase(dbname);
+ result.append( "dropped" , dbname );
+ return true;
+ }
+ } cmdDropDatabase;
+
+ class CmdRepairDatabase : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool maintenanceMode() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "repair database. also compacts. note: slow.";
+ }
+ virtual LockType locktype() const { return WRITE; }
+ CmdRepairDatabase() : Command("repairDatabase") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.firstElement();
+ log() << "repairDatabase " << dbname << endl;
+ int p = (int) e.number();
+ if ( p != 1 ) {
+ errmsg = "bad option";
+ return false;
+ }
+ e = cmdObj.getField( "preserveClonedFilesOnFailure" );
+ bool preserveClonedFilesOnFailure = e.isBoolean() && e.boolean();
+ e = cmdObj.getField( "backupOriginalFiles" );
+ bool backupOriginalFiles = e.isBoolean() && e.boolean();
+ return repairDatabase( dbname, errmsg, preserveClonedFilesOnFailure, backupOriginalFiles );
+ }
+ } cmdRepairDatabase;
+
+ /* set db profiling level
+ todo: how do we handle profiling information put in the db with replication?
+ sensibly or not?
+ */
+ class CmdProfile : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "enable or disable performance profiling\n";
+ help << "{ profile : <n> }\n";
+ help << "0=off 1=log slow ops 2=log all\n";
+ help << "-1 to get current values\n";
+ help << "http://www.mongodb.org/display/DOCS/Database+Profiler";
+ }
+ virtual LockType locktype() const { return WRITE; }
+ CmdProfile() : Command("profile") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.firstElement();
+ result.append("was", cc().database()->profile);
+ result.append("slowms", cmdLine.slowMS );
+
+ int p = (int) e.number();
+ bool ok = false;
+
+ if ( p == -1 )
+ ok = true;
+ else if ( p >= 0 && p <= 2 ) {
+ ok = cc().database()->setProfilingLevel( p , errmsg );
+ }
+
+ BSONElement slow = cmdObj["slowms"];
+ if ( slow.isNumber() )
+ cmdLine.slowMS = slow.numberInt();
+
+ return ok;
+ }
+ } cmdProfile;
+
+ class CmdServerStatus : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ CmdServerStatus() : Command("serverStatus", true) {}
+
+ virtual LockType locktype() const { return NONE; }
+
+ virtual void help( stringstream& help ) const {
+ help << "returns lots of administrative server statistics";
+ }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ long long start = Listener::getElapsedTimeMillis();
+ BSONObjBuilder timeBuilder(128);
+
+
+ bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin");
+
+ result.append( "host" , prettyHostName() );
+ result.append("version", versionString);
+ result.append("process","mongod");
+ result.append("uptime",(double) (time(0)-cmdLine.started));
+ result.append("uptimeEstimate",(double) (start/1000));
+ result.appendDate( "localTime" , jsTime() );
+
+ {
+ BSONObjBuilder t;
+
+ unsigned long long last, start, timeLocked;
+ d.dbMutex.info().getTimingInfo(start, timeLocked);
+ last = curTimeMicros64();
+ double tt = (double) last-start;
+ double tl = (double) timeLocked;
+ t.append("totalTime", tt);
+ t.append("lockTime", tl);
+ t.append("ratio", (tt ? tl/tt : 0));
+
+ {
+ BSONObjBuilder ttt( t.subobjStart( "currentQueue" ) );
+ int w=0, r=0;
+ Client::recommendedYieldMicros( &w , &r );
+ ttt.append( "total" , w + r );
+ ttt.append( "readers" , r );
+ ttt.append( "writers" , w );
+ ttt.done();
+ }
+
+ {
+ BSONObjBuilder ttt( t.subobjStart( "activeClients" ) );
+ int w=0, r=0;
+ Client::getActiveClientCount( w , r );
+ ttt.append( "total" , w + r );
+ ttt.append( "readers" , r );
+ ttt.append( "writers" , w );
+ ttt.done();
+ }
+
+
+
+ result.append( "globalLock" , t.obj() );
+ }
+ timeBuilder.appendNumber( "after basic" , Listener::getElapsedTimeMillis() - start );
+
+ {
+
+ BSONObjBuilder t( result.subobjStart( "mem" ) );
+
+ t.append("bits", ( sizeof(int*) == 4 ? 32 : 64 ) );
+
+ ProcessInfo p;
+ int v = 0;
+ if ( p.supported() ) {
+ t.appendNumber( "resident" , p.getResidentSize() );
+ v = p.getVirtualMemorySize();
+ t.appendNumber( "virtual" , v );
+ t.appendBool( "supported" , true );
+ }
+ else {
+ result.append( "note" , "not all mem info support on this platform" );
+ t.appendBool( "supported" , false );
+ }
+
+ timeBuilder.appendNumber( "middle of mem" , Listener::getElapsedTimeMillis() - start );
+
+ int m = (int) (MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ));
+ t.appendNumber( "mapped" , m );
+
+ if ( cmdLine.dur ) {
+ m *= 2;
+ t.appendNumber( "mappedWithJournal" , m );
+ }
+
+ int overhead = v - m - connTicketHolder.used();
+
+ if( overhead > 4000 ) {
+ t.append("note", "virtual minus mapped is large. could indicate a memory leak");
+ log() << "warning: virtual size (" << v << "MB) - mapped size (" << m << "MB) is large (" << overhead << "MB). could indicate a memory leak" << endl;
+ }
+
+ t.done();
+
+ }
+ timeBuilder.appendNumber( "after mem" , Listener::getElapsedTimeMillis() - start );
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "connections" ) );
+ bb.append( "current" , connTicketHolder.used() );
+ bb.append( "available" , connTicketHolder.available() );
+ bb.done();
+ }
+ timeBuilder.appendNumber( "after connections" , Listener::getElapsedTimeMillis() - start );
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "extra_info" ) );
+ bb.append("note", "fields vary by platform");
+ ProcessInfo p;
+ p.getExtraInfo(bb);
+ bb.done();
+ timeBuilder.appendNumber( "after extra info" , Listener::getElapsedTimeMillis() - start );
+
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "indexCounters" ) );
+ globalIndexCounters.append( bb );
+ bb.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "backgroundFlushing" ) );
+ globalFlushCounters.append( bb );
+ bb.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "cursors" ) );
+ ClientCursor::appendStats( bb );
+ bb.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "network" ) );
+ networkCounter.append( bb );
+ bb.done();
+ }
+
+
+ timeBuilder.appendNumber( "after counters" , Listener::getElapsedTimeMillis() - start );
+
+ if ( anyReplEnabled() ) {
+ BSONObjBuilder bb( result.subobjStart( "repl" ) );
+ appendReplicationInfo( bb , authed , cmdObj["repl"].numberInt() );
+ bb.done();
+
+ if ( ! _isMaster() ) {
+ result.append( "opcountersRepl" , replOpCounters.getObj() );
+ }
+
+ }
+
+ timeBuilder.appendNumber( "after repl" , Listener::getElapsedTimeMillis() - start );
+
+ result.append( "opcounters" , globalOpCounters.getObj() );
+
+ {
+ BSONObjBuilder asserts( result.subobjStart( "asserts" ) );
+ asserts.append( "regular" , assertionCount.regular );
+ asserts.append( "warning" , assertionCount.warning );
+ asserts.append( "msg" , assertionCount.msg );
+ asserts.append( "user" , assertionCount.user );
+ asserts.append( "rollovers" , assertionCount.rollovers );
+ asserts.done();
+ }
+
+ timeBuilder.appendNumber( "after asserts" , Listener::getElapsedTimeMillis() - start );
+
+ result.append( "writeBacksQueued" , ! writeBackManager.queuesEmpty() );
+
+ if( cmdLine.dur ) {
+ result.append("dur", dur::stats.asObj());
+ }
+
+ timeBuilder.appendNumber( "after dur" , Listener::getElapsedTimeMillis() - start );
+
+ {
+ RamLog* rl = RamLog::get( "warnings" );
+ verify(15880, rl);
+
+ if (rl->lastWrite() >= time(0)-(10*60)){ // only show warnings from last 10 minutes
+ vector<const char*> lines;
+ rl->get( lines );
+
+ BSONArrayBuilder arr( result.subarrayStart( "warnings" ) );
+ for ( unsigned i=std::max(0,(int)lines.size()-10); i<lines.size(); i++ )
+ arr.append( lines[i] );
+ arr.done();
+ }
+ }
+
+ if ( ! authed )
+ result.append( "note" , "run against admin for more info" );
+
+ timeBuilder.appendNumber( "at end" , Listener::getElapsedTimeMillis() - start );
+ if ( Listener::getElapsedTimeMillis() - start > 1000 ) {
+ BSONObj t = timeBuilder.obj();
+ log() << "serverStatus was very slow: " << t << endl;
+ result.append( "timing" , t );
+ }
+
+ return true;
+ }
+ } cmdServerStatus;
+
+ class CmdGetOpTime : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const { help << "internal"; }
+ virtual LockType locktype() const { return NONE; }
+ CmdGetOpTime() : Command("getoptime") { }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ writelock l( "" );
+ result.appendDate("optime", OpTime::now().asDate());
+ return true;
+ }
+ } cmdgetoptime;
+
+ /*
+ class Cmd : public Command {
+ public:
+ Cmd() : Command("") { }
+ bool adminOnly() const { return true; }
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result) {
+ return true;
+ }
+ } cmd;
+ */
+
+ class CmdDiagLogging : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ CmdDiagLogging() : Command("diagLogging") { }
+ bool adminOnly() const {
+ return true;
+ }
+ void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Monitoring+and+Diagnostics#MonitoringandDiagnostics-DatabaseRecord%2FReplay"; }
+ virtual LockType locktype() const { return WRITE; }
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() );
+ _diaglog.flush();
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: diagLogging set to " << _diaglog.getLevel() << " from: " << was << endl;
+ result.append( "was" , was );
+ return true;
+ }
+ } cmddiaglogging;
+
+ /* remove a bit from a bit array - actually remove its slot, not just clear it
+ note: this function does not work with x == 63 -- that is ok for now,
+ but keep in mind that if the maximum number of indexes were ever
+ extended to exactly 64 it would become a problem
+ */
+ unsigned long long removeBit(unsigned long long b, int x) {
+ unsigned long long tmp = b;
+ return
+ (tmp & ((((unsigned long long) 1) << x)-1)) |
+ ((tmp >> (x+1)) << x);
+ }
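+
+ // worked example: removeBit(21, 2) -- 21 is 0b10101; the two bits below slot 2
+ // (0b01) are kept, the bits above slot 2 (0b10) shift down one slot to give
+ // 0b1001 == 9, matching the unit test below.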
+
+ struct DBCommandsUnitTest {
+ DBCommandsUnitTest() {
+ assert( removeBit(1, 0) == 0 );
+ assert( removeBit(2, 0) == 1 );
+ assert( removeBit(2, 1) == 0 );
+ assert( removeBit(255, 1) == 127 );
+ assert( removeBit(21, 2) == 9 );
+ assert( removeBit(0x4000000000000001ULL, 62) == 1 );
+ }
+ } dbc_unittest;
+
+ void assureSysIndexesEmptied(const char *ns, IndexDetails *exceptForIdIndex);
+ int removeFromSysIndexes(const char *ns, const char *idxName);
+
+ bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool mayDeleteIdIndex ) {
+
+ BackgroundOperation::assertNoBgOpInProgForNs(ns);
+
+ d = d->writingWithExtra();
+ d->aboutToDeleteAnIndex();
+
+ /* there may be pointers pointing at keys in the btree(s). kill them. */
+ ClientCursor::invalidate(ns);
+
+ // delete a specific index or all?
+ if ( *name == '*' && name[1] == 0 ) {
+ log(4) << " d->nIndexes was " << d->nIndexes << '\n';
+ anObjBuilder.append("nIndexesWas", (double)d->nIndexes);
+ IndexDetails *idIndex = 0;
+ if( d->nIndexes ) {
+ for ( int i = 0; i < d->nIndexes; i++ ) {
+ if ( !mayDeleteIdIndex && d->idx(i).isIdIndex() ) {
+ idIndex = &d->idx(i);
+ }
+ else {
+ d->idx(i).kill_idx();
+ }
+ }
+ d->nIndexes = 0;
+ }
+ if ( idIndex ) {
+ d->addIndex(ns) = *idIndex;
+ wassert( d->nIndexes == 1 );
+ }
+ /* assuming here that id index is not multikey: */
+ d->multiKeyIndexBits = 0;
+ assureSysIndexesEmptied(ns, idIndex);
+ anObjBuilder.append("msg", mayDeleteIdIndex ?
+ "indexes dropped for collection" :
+ "non-_id indexes dropped for collection");
+ }
+ else {
+ // delete just one index
+ int x = d->findIndexByName(name);
+ if ( x >= 0 ) {
+ log(4) << " d->nIndexes was " << d->nIndexes << endl;
+ anObjBuilder.append("nIndexesWas", (double)d->nIndexes);
+
+ /* note it is important we remove the IndexDetails with this
+ call, otherwise, on recreate, the old one would be reused, and its
+ IndexDetails::info ptr would be bad info.
+ */
+ IndexDetails *id = &d->idx(x);
+ if ( !mayDeleteIdIndex && id->isIdIndex() ) {
+ errmsg = "may not delete _id index";
+ return false;
+ }
+ id->kill_idx();
+ d->multiKeyIndexBits = removeBit(d->multiKeyIndexBits, x);
+ d->nIndexes--;
+ for ( int i = x; i < d->nIndexes; i++ )
+ d->idx(i) = d->idx(i+1);
+ }
+ else {
+ int n = removeFromSysIndexes(ns, name); // just in case there is an orphaned listing - i.e. it should have been repaired but wasn't
+ if( n ) {
+ log() << "info: removeFromSysIndexes cleaned up " << n << " entries" << endl;
+ }
+ log() << "dropIndexes: " << name << " not found" << endl;
+ errmsg = "index not found";
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /* drop collection */
+ class CmdDrop : public Command {
+ public:
+ CmdDrop() : Command("drop") { }
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return false;
+ }
+ virtual void help( stringstream& help ) const { help << "drop a collection\n{drop : <collectionName>}"; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string nsToDrop = dbname + '.' + cmdObj.firstElement().valuestr();
+ NamespaceDetails *d = nsdetails(nsToDrop.c_str());
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: drop " << nsToDrop << endl;
+ if ( d == 0 ) {
+ errmsg = "ns not found";
+ return false;
+ }
+ uassert( 10039 , "can't drop collection with reserved $ character in name", strchr(nsToDrop.c_str(), '$') == 0 );
+ dropCollection( nsToDrop, errmsg, result );
+ return true;
+ }
+ } cmdDrop;
+
+ /* select count(*) */
+ class CmdCount : public Command {
+ public:
+ virtual LockType locktype() const { return READ; }
+ CmdCount() : Command("count") { }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const {
+ // ok on --slave setups
+ return replSettings.slave == SimpleSlave;
+ }
+ virtual bool slaveOverrideOk() { return true; }
+ virtual bool maintenanceOk() const { return false; }
+ virtual bool adminOnly() const { return false; }
+ virtual void help( stringstream& help ) const { help << "count objects in collection"; }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string ns = parseNs(dbname, cmdObj);
+ string err;
+ long long n = runCount(ns.c_str(), cmdObj, err);
+ long long nn = n;
+ bool ok = true;
+ if ( n == -1 ) {
+ nn = 0;
+ result.appendBool( "missing" , true );
+ }
+ else if ( n < 0 ) {
+ nn = 0;
+ ok = false;
+ if ( !err.empty() )
+ errmsg = err;
+ }
+ result.append("n", (double) nn);
+ return ok;
+ }
+ } cmdCount;
+
+ /* create collection */
+ class CmdCreate : public Command {
+ public:
+ CmdCreate() : Command("create") { }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream& help ) const {
+ help << "create a collection explicitly\n"
+ "{ create: <ns>[, capped: <bool>, size: <collSizeInBytes>, max: <nDocs>] }";
+ }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ uassert(15888, "must pass name of collection to create", cmdObj.firstElement().valuestrsafe()[0] != '\0');
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+ string err;
+ uassert(14832, "specify size:<n> when capped is true", !cmdObj["capped"].trueValue() || cmdObj["size"].isNumber() || cmdObj.hasField("$nExtents"));
+ bool ok = userCreateNS(ns.c_str(), cmdObj, err, ! fromRepl );
+ if ( !ok && !err.empty() )
+ errmsg = err;
+ return ok;
+ }
+ } cmdCreate;
+
+ /* "dropIndexes" is now the preferred form - "deleteIndexes" deprecated */
+ class CmdDropIndexes : public Command {
+ public:
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream& help ) const {
+ help << "drop indexes for a collection";
+ }
+ CmdDropIndexes() : Command("dropIndexes", false, "deleteIndexes") { }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) {
+ BSONElement e = jsobj.firstElement();
+ string toDeleteNs = dbname + '.' + e.valuestr();
+ NamespaceDetails *d = nsdetails(toDeleteNs.c_str());
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: dropIndexes " << toDeleteNs << endl;
+ if ( d ) {
+ BSONElement f = jsobj.getField("index");
+ if ( f.type() == String ) {
+ return dropIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false );
+ }
+ else if ( f.type() == Object ) {
+ int idxId = d->findIndexByKeyPattern( f.embeddedObject() );
+ if ( idxId < 0 ) {
+ errmsg = "can't find index with key:";
+ errmsg += f.embeddedObject().toString();
+ return false;
+ }
+ else {
+ IndexDetails& ii = d->idx( idxId );
+ string iName = ii.indexName();
+ return dropIndexes( d, toDeleteNs.c_str(), iName.c_str() , errmsg, anObjBuilder, false );
+ }
+ }
+ else {
+ errmsg = "invalid index name spec";
+ return false;
+ }
+ }
+ else {
+ errmsg = "ns not found";
+ return false;
+ }
+ }
+ } cmdDropIndexes;
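+
+ /* Example invocations (illustrative): drop a single index by name or by key
+ pattern, or all non-_id indexes with "*":
+ db.runCommand( { dropIndexes : "posts", index : "x_1" } )
+ db.runCommand( { dropIndexes : "posts", index : { x : 1 } } )
+ db.runCommand( { dropIndexes : "posts", index : "*" } )
+ */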
+
+ class CmdReIndex : public Command {
+ public:
+ virtual bool logTheOp() { return false; } // only reindexes on the one node
+ virtual bool slaveOk() const { return true; } // can reindex on a secondary
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream& help ) const {
+ help << "re-index a collection";
+ }
+ CmdReIndex() : Command("reIndex") { }
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ static DBDirectClient db;
+
+ BSONElement e = jsobj.firstElement();
+ string toDeleteNs = dbname + '.' + e.valuestr();
+ NamespaceDetails *d = nsdetails(toDeleteNs.c_str());
+ tlog() << "CMD: reIndex " << toDeleteNs << endl;
+ BackgroundOperation::assertNoBgOpInProgForNs(toDeleteNs.c_str());
+
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return false;
+ }
+
+ list<BSONObj> all;
+ auto_ptr<DBClientCursor> i = db.query( dbname + ".system.indexes" , BSON( "ns" << toDeleteNs ) , 0 , 0 , 0 , QueryOption_SlaveOk );
+ BSONObjBuilder b;
+ while ( i->more() ) {
+ BSONObj o = i->next().removeField("v").getOwned();
+ b.append( BSONObjBuilder::numStr( all.size() ) , o );
+ all.push_back( o );
+ }
+
+
+ bool ok = dropIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true );
+ if ( ! ok ) {
+ errmsg = "dropIndexes failed";
+ return false;
+ }
+
+ for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ) {
+ BSONObj o = *i;
+ log(1) << "reIndex ns: " << toDeleteNs << " index: " << o << endl;
+ theDataFileMgr.insertWithObjMod( Namespace( toDeleteNs.c_str() ).getSisterNS( "system.indexes" ).c_str() , o , true );
+ }
+
+ result.append( "nIndexes" , (int)all.size() );
+ result.appendArray( "indexes" , b.obj() );
+ return true;
+ }
+ } cmdReIndex;
+
+ class CmdListDatabases : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool slaveOverrideOk() {
+ return true;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream& help ) const { help << "list databases on this server"; }
+ CmdListDatabases() : Command("listDatabases" , true ) {}
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ vector< string > dbNames;
+ getDatabaseNames( dbNames );
+ vector< BSONObj > dbInfos;
+
+ set<string> seen;
+ boost::intmax_t totalSize = 0;
+ for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
+ BSONObjBuilder b;
+ b.append( "name", *i );
+
+ boost::intmax_t size = dbSize( i->c_str() );
+ b.append( "sizeOnDisk", (double) size );
+ totalSize += size;
+
+ {
+ Client::ReadContext rc( *i + ".system.namespaces" );
+ b.appendBool( "empty", rc.ctx().db()->isEmpty() );
+ }
+
+ dbInfos.push_back( b.obj() );
+
+ seen.insert( i->c_str() );
+ }
+
+ // TODO: erh 1/1/2010 I think this is broken where path != dbpath ??
+ set<string> allShortNames;
+ {
+ readlock lk;
+ dbHolder().getAllShortNames( false, allShortNames );
+ }
+
+ for ( set<string>::iterator i = allShortNames.begin(); i != allShortNames.end(); i++ ) {
+ string name = *i;
+
+ if ( seen.count( name ) )
+ continue;
+
+ BSONObjBuilder b;
+ b.append( "name" , name );
+ b.append( "sizeOnDisk" , (double)1.0 );
+
+ {
+ readlock lk( name );
+ Client::Context ctx( name );
+ b.appendBool( "empty", ctx.db()->isEmpty() );
+ }
+
+ dbInfos.push_back( b.obj() );
+ }
+
+ result.append( "databases", dbInfos );
+ result.append( "totalSize", double( totalSize ) );
+ return true;
+ }
+ } cmdListDatabases;
+
+ /* note an access to a database right after this will open it back up - so this is mainly
+ for diagnostic purposes.
+ */
+ class CmdCloseAllDatabases : public Command {
+ public:
+ virtual void help( stringstream& help ) const { help << "Close all database files.\nA new request will cause an immediate reopening; thus, this is mostly for testing purposes."; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+
+ CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {}
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ bool ok;
+ try {
+ ok = dbHolderW().closeAll( dbpath , result, false );
+ }
+ catch(DBException&) {
+ throw;
+ }
+ catch(...) {
+ log() << "ERROR uncaught exception in command closeAllDatabases" << endl;
+ errmsg = "unexpected uncaught exception";
+ return false;
+ }
+ return ok;
+ }
+ } cmdCloseAllDatabases;
+
+ class CmdFileMD5 : public Command {
+ public:
+ CmdFileMD5() : Command( "filemd5" ) {}
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }";
+ }
+ virtual LockType locktype() const { return READ; }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string ns = dbname;
+ ns += ".";
+ {
+ string root = jsobj.getStringField( "root" );
+ if ( root.size() == 0 )
+ root = "fs";
+ ns += root;
+ }
+ ns += ".chunks"; // make this an option in jsobj
+
+ md5digest d;
+ md5_state_t st;
+ md5_init(&st);
+
+ BSONObj query = BSON( "files_id" << jsobj["filemd5"] );
+ BSONObj sort = BSON( "files_id" << 1 << "n" << 1 );
+
+ shared_ptr<Cursor> cursor = bestGuessCursor(ns.c_str(), query, sort);
+ if ( ! cursor ) {
+ errmsg = "need an index on { files_id : 1 , n : 1 }";
+ return false;
+ }
+ auto_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns.c_str()));
+
+ int n = 0;
+ while ( cursor->ok() ) {
+ if ( ! cursor->matcher()->matchesCurrent( cursor.get() ) ) {
+ log() << "**** NOT MATCHING ****" << endl;
+ PRINT(cursor->current());
+ cursor->advance();
+ continue;
+ }
+
+ BSONObj obj = cursor->current();
+ cursor->advance();
+
+ BSONElement ne = obj["n"];
+ assert(ne.isNumber());
+ int myn = ne.numberInt();
+ if ( n != myn ) {
+ log() << "should have chunk: " << n << " have:" << myn << endl;
+ dumpChunks( ns , query , sort );
+ uassert( 10040 , "chunks out of order" , n == myn );
+ }
+
+ int len;
+ const char * data = obj["data"].binDataClean( len );
+
+ ClientCursor::YieldLock yield (cc.get());
+ try {
+ md5_append( &st , (const md5_byte_t*)(data) , len );
+ n++;
+ }
+ catch (...) {
+ if ( ! yield.stillOk() ) // relocks
+ cc.release();
+ throw;
+ }
+
+ if ( ! yield.stillOk() ) {
+ cc.release();
+ uasserted(13281, "File deleted during filemd5 command");
+ }
+ }
+
+ md5_finish(&st, d);
+
+ result.append( "numChunks" , n );
+ result.append( "md5" , digestToString( d ) );
+ return true;
+ }
+
+ void dumpChunks( const string& ns , const BSONObj& query , const BSONObj& sort ) {
+ DBDirectClient client;
+ Query q(query);
+ q.sort(sort);
+ auto_ptr<DBClientCursor> c = client.query(ns, q);
+ while(c->more())
+ PRINT(c->nextSafe());
+ }
+ } cmdFileMD5;
+
+ static IndexDetails *cmdIndexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
+ if ( ns[ 0 ] == '\0' || min.isEmpty() || max.isEmpty() ) {
+ errmsg = "invalid command syntax (note: min and max are required)";
+ return 0;
+ }
+ return indexDetailsForRange( ns, errmsg, min, max, keyPattern );
+ }
+
+ class CmdDatasize : public Command {
+ virtual string parseNs(const string& dbname, const BSONObj& cmdObj) const {
+ return parseNsFullyQualified(dbname, cmdObj);
+ }
+ public:
+ CmdDatasize() : Command( "dataSize", false, "datasize" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help <<
+ "determine data size for a set of data in a certain range"
+ "\nexample: { dataSize:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }"
+ "\nkeyPattern, min, and max parameters are optional."
+ "\nnote: This command may take a while to run";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer timer;
+
+ string ns = jsobj.firstElement().String();
+ BSONObj min = jsobj.getObjectField( "min" );
+ BSONObj max = jsobj.getObjectField( "max" );
+ BSONObj keyPattern = jsobj.getObjectField( "keyPattern" );
+ bool estimate = jsobj["estimate"].trueValue();
+
+ Client::Context ctx( ns );
+ NamespaceDetails *d = nsdetails(ns.c_str());
+
+ if ( ! d || d->stats.nrecords == 0 ) {
+ result.appendNumber( "size" , 0 );
+ result.appendNumber( "numObjects" , 0 );
+ result.append( "millis" , timer.millis() );
+ return true;
+ }
+
+ result.appendBool( "estimate" , estimate );
+
+ shared_ptr<Cursor> c;
+ if ( min.isEmpty() && max.isEmpty() ) {
+ if ( estimate ) {
+ result.appendNumber( "size" , d->stats.datasize );
+ result.appendNumber( "numObjects" , d->stats.nrecords );
+ result.append( "millis" , timer.millis() );
+ return true;
+ }
+ c = theDataFileMgr.findAll( ns.c_str() );
+ }
+ else if ( min.isEmpty() || max.isEmpty() ) {
+ errmsg = "only one of min or max specified";
+ return false;
+ }
+ else {
+ IndexDetails *idx = cmdIndexDetailsForRange( ns.c_str(), errmsg, min, max, keyPattern );
+ if ( idx == 0 )
+ return false;
+
+ c.reset( BtreeCursor::make( d, d->idxNo(*idx), *idx, min, max, false, 1 ) );
+ }
+
+ long long avgObjSize = d->stats.datasize / d->stats.nrecords;
+
+ long long maxSize = jsobj["maxSize"].numberLong();
+ long long maxObjects = jsobj["maxObjects"].numberLong();
+
+ long long size = 0;
+ long long numObjects = 0;
+ while( c->ok() ) {
+
+ if ( estimate )
+ size += avgObjSize;
+ else
+ size += c->currLoc().rec()->netLength();
+
+ numObjects++;
+
+ if ( ( maxSize && size > maxSize ) ||
+ ( maxObjects && numObjects > maxObjects ) ) {
+ result.appendBool( "maxReached" , true );
+ break;
+ }
+
+ c->advance();
+ }
+
+ ostringstream os;
+ os << "Finding size for ns: " << ns;
+ if ( ! min.isEmpty() ) {
+ os << " between " << min << " and " << max;
+ }
+ logIfSlow( timer , os.str() );
+
+ result.appendNumber( "size", size );
+ result.appendNumber( "numObjects" , numObjects );
+ result.append( "millis" , timer.millis() );
+ return true;
+ }
+ } cmdDatasize;
+
+ namespace {
+ long long getIndexSizeForCollection(string db, string ns, BSONObjBuilder* details=NULL, int scale = 1 ) {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ if ( ! nsd )
+ return 0;
+
+ long long totalSize = 0;
+
+ NamespaceDetails::IndexIterator ii = nsd->ii();
+ while ( ii.more() ) {
+ IndexDetails& d = ii.next();
+ string collNS = d.indexNamespace();
+ NamespaceDetails * mine = nsdetails( collNS.c_str() );
+ if ( ! mine ) {
+ log() << "error: have index [" << collNS << "] but no NamespaceDetails" << endl;
+ continue;
+ }
+ totalSize += mine->stats.datasize;
+ if ( details )
+ details->appendNumber( d.indexName() , mine->stats.datasize / scale );
+ }
+ return totalSize;
+ }
+ }
+
+ class CollectionStats : public Command {
+ public:
+ CollectionStats() : Command( "collStats", false, "collstats" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help << "{ collStats:\"blog.posts\" , scale : 1 } scale divides sizes e.g. for KB use 1024\n"
+ " avgObjSize - in bytes";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string ns = dbname + "." + jsobj.firstElement().valuestr();
+ Client::Context cx( ns );
+
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ if ( ! nsd ) {
+ errmsg = "ns not found";
+ return false;
+ }
+
+ result.append( "ns" , ns.c_str() );
+
+ int scale = 1;
+ if ( jsobj["scale"].isNumber() ) {
+ scale = jsobj["scale"].numberInt();
+ if ( scale <= 0 ) {
+ errmsg = "scale has to be > 0";
+ return false;
+ }
+ }
+ else if ( jsobj["scale"].trueValue() ) {
+ errmsg = "scale has to be a number > 0";
+ return false;
+ }
+
+ bool verbose = jsobj["verbose"].trueValue();
+
+ long long size = nsd->stats.datasize / scale;
+ result.appendNumber( "count" , nsd->stats.nrecords );
+ result.appendNumber( "size" , size );
+ if( nsd->stats.nrecords )
+ result.append ( "avgObjSize" , double(size) / double(nsd->stats.nrecords) );
+
+ int numExtents;
+ BSONArrayBuilder extents;
+
+ result.appendNumber( "storageSize" , nsd->storageSize( &numExtents , verbose ? &extents : 0 ) / scale );
+ result.append( "numExtents" , numExtents );
+ result.append( "nindexes" , nsd->nIndexes );
+ result.append( "lastExtentSize" , nsd->lastExtentSize / scale );
+ result.append( "paddingFactor" , nsd->paddingFactor );
+ result.append( "flags" , nsd->flags );
+
+ BSONObjBuilder indexSizes;
+ result.appendNumber( "totalIndexSize" , getIndexSizeForCollection(dbname, ns, &indexSizes, scale) / scale );
+ result.append("indexSizes", indexSizes.obj());
+
+ if ( nsd->capped ) {
+ result.append( "capped" , nsd->capped );
+ result.append( "max" , nsd->max );
+ }
+
+ if ( verbose )
+ result.appendArray( "extents" , extents.arr() );
+
+ return true;
+ }
+ } cmdCollectionStats;
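+
+ /* Example (illustrative): report sizes in kilobytes rather than bytes by
+ passing scale, which divides each size field:
+ db.runCommand( { collStats : "posts", scale : 1024 } )
+ */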
+
+ class DBStats : public Command {
+ public:
+ DBStats() : Command( "dbStats", false, "dbstats" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help <<
+ "Get stats on a database. Not instantaneous. Slower for databases with large .ns files.\n" <<
+ "Example: { dbStats:1, scale:1 }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ int scale = 1;
+ if ( jsobj["scale"].isNumber() ) {
+ scale = jsobj["scale"].numberInt();
+ if ( scale <= 0 ) {
+ errmsg = "scale has to be > 0";
+ return false;
+ }
+ }
+ else if ( jsobj["scale"].trueValue() ) {
+ errmsg = "scale has to be a number > 0";
+ return false;
+ }
+
+ list<string> collections;
+ Database* d = cc().database();
+ if ( d )
+ d->namespaceIndex.getNamespaces( collections );
+
+ long long ncollections = 0;
+ long long objects = 0;
+ long long size = 0;
+ long long storageSize = 0;
+ long long numExtents = 0;
+ long long indexes = 0;
+ long long indexSize = 0;
+
+ for (list<string>::const_iterator it = collections.begin(); it != collections.end(); ++it) {
+ const string ns = *it;
+
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ if ( ! nsd ) {
+ errmsg = "missing ns: ";
+ errmsg += ns;
+ return false;
+ }
+
+ ncollections += 1;
+ objects += nsd->stats.nrecords;
+ size += nsd->stats.datasize;
+
+ int temp;
+ storageSize += nsd->storageSize( &temp );
+ numExtents += temp;
+
+ indexes += nsd->nIndexes;
+ indexSize += getIndexSizeForCollection(dbname, ns);
+ }
+
+ result.append ( "db" , dbname );
+ result.appendNumber( "collections" , ncollections );
+ result.appendNumber( "objects" , objects );
+ result.append ( "avgObjSize" , objects == 0 ? 0 : double(size) / double(objects) );
+ result.appendNumber( "dataSize" , size / scale );
+ result.appendNumber( "storageSize" , storageSize / scale);
+ result.appendNumber( "numExtents" , numExtents );
+ result.appendNumber( "indexes" , indexes );
+ result.appendNumber( "indexSize" , indexSize / scale );
+ result.appendNumber( "fileSize" , d->fileSize() / scale );
+ if( d )
+ result.appendNumber( "nsSizeMB", (int) d->namespaceIndex.fileLength() / 1024 / 1024 );
+
+ return true;
+ }
+ } cmdDBStats;
+
+ /* convertToCapped seems to use this */
+ class CmdCloneCollectionAsCapped : public Command {
+ public:
+ CmdCloneCollectionAsCapped() : Command( "cloneCollectionAsCapped" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "{ cloneCollectionAsCapped:<fromName>, toCollection:<toName>, size:<sizeInBytes> }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string from = jsobj.getStringField( "cloneCollectionAsCapped" );
+ string to = jsobj.getStringField( "toCollection" );
+ long long size = (long long)jsobj.getField( "size" ).number();
+
+ if ( from.empty() || to.empty() || size == 0 ) {
+ errmsg = "invalid command spec";
+ return false;
+ }
+
+ string fromNs = dbname + "." + from;
+ string toNs = dbname + "." + to;
+ NamespaceDetails *nsd = nsdetails( fromNs.c_str() );
+ massert( 10301 , "source collection " + fromNs + " does not exist", nsd );
+ long long excessSize = nsd->stats.datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size'
+ DiskLoc extent = nsd->firstExtent;
+ for( ; excessSize > extent.ext()->length && extent != nsd->lastExtent; extent = extent.ext()->xnext ) {
+ excessSize -= extent.ext()->length;
+ log( 2 ) << "cloneCollectionAsCapped skipping extent of size " << extent.ext()->length << endl;
+ log( 6 ) << "excessSize: " << excessSize << endl;
+ }
+ DiskLoc startLoc = extent.ext()->firstRecord;
+
+ CursorId id;
+ {
+ shared_ptr<Cursor> c = theDataFileMgr.findAll( fromNs.c_str(), startLoc );
+ ClientCursor *cc = new ClientCursor(0, c, fromNs.c_str());
+ id = cc->cursorid();
+ }
+
+ DBDirectClient client;
+ Client::Context ctx( toNs );
+ BSONObjBuilder spec;
+ spec.appendBool( "capped", true );
+ spec.append( "size", double( size ) );
+ if ( !userCreateNS( toNs.c_str(), spec.done(), errmsg, true ) )
+ return false;
+
+ auto_ptr< DBClientCursor > c = client.getMore( fromNs, id );
+ while( c->more() ) {
+ BSONObj obj = c->next();
+ theDataFileMgr.insertAndLog( toNs.c_str(), obj, true );
+ getDur().commitIfNeeded();
+ }
+
+ return true;
+ }
+ } cmdCloneCollectionAsCapped;
+
+ /* jan2010:
+ Converts the given collection to a capped collection w/ the specified size.
+ This command is not highly used, and is not currently supported in sharded
+ environments.
+ */
+ class CmdConvertToCapped : public Command {
+ public:
+ CmdConvertToCapped() : Command( "convertToCapped" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "{ convertToCapped:<fromCollectionName>, size:<sizeInBytes> }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ BackgroundOperation::assertNoBgOpInProgForDb(dbname.c_str());
+
+ string from = jsobj.getStringField( "convertToCapped" );
+ long long size = (long long)jsobj.getField( "size" ).number();
+
+ if ( from.empty() || size == 0 ) {
+ errmsg = "invalid command spec";
+ return false;
+ }
+
+ string shortTmpName = str::stream() << ".tmp.convertToCapped." << from;
+ string longTmpName = str::stream() << dbname << "." << shortTmpName;
+
+ DBDirectClient client;
+ client.dropCollection( longTmpName );
+
+ BSONObj info;
+ if ( !client.runCommand( dbname ,
+ BSON( "cloneCollectionAsCapped" << from << "toCollection" << shortTmpName << "size" << double( size ) ),
+ info ) ) {
+ errmsg = "cloneCollectionAsCapped failed: " + info.toString();
+ return false;
+ }
+
+ if ( !client.dropCollection( dbname + "." + from ) ) {
+ errmsg = "failed to drop original collection";
+ return false;
+ }
+
+ if ( !client.runCommand( "admin",
+ BSON( "renameCollection" << longTmpName <<
+ "to" << ( dbname + "." + from ) ),
+ info ) ) {
+ errmsg = "renameCollection failed: " + info.toString();
+ return false;
+ }
+
+ return true;
+ }
+ } cmdConvertToCapped;
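+
+ /* Hedged usage sketch (shell syntax; name and size illustrative):
+ db.runCommand( { convertToCapped: "log", size: 1048576 } )
+ clones <db>.log into a temporary capped collection named .tmp.convertToCapped.log,
+ drops the original, then renames the temp collection back to <db>.log. */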
+
+ /* Returns client's uri */
+ class CmdWhatsMyUri : public Command {
+ public:
+ CmdWhatsMyUri() : Command("whatsmyuri") { }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "{whatsmyuri:1}";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ BSONObj info = cc().curop()->infoNoauth();
+ result << "you" << info[ "client" ];
+ return true;
+ }
+ } cmdWhatsMyUri;
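+
+ /* Hedged usage sketch: db.runCommand( { whatsmyuri: 1 } ) returns something like
+ { you: "127.0.0.1:50630", ok: 1 } -- the client host:port as the server sees it
+ (address illustrative). */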
+
+ /* For testing only, not for general use */
+ class GodInsert : public Command {
+ public:
+ GodInsert() : Command( "godinsert" ) { }
+ virtual bool adminOnly() const { return false; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << "internal. for testing only.";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ if ( ! ai->isLocalHost ) {
+ errmsg = "godinsert only works locally";
+ return false;
+ }
+
+ string coll = cmdObj[ "godinsert" ].valuestrsafe();
+ log() << "test only command godinsert invoked coll:" << coll << endl;
+ uassert( 13049, "godinsert must specify a collection", !coll.empty() );
+ string ns = dbname + "." + coll;
+ BSONObj obj = cmdObj[ "obj" ].embeddedObjectUserCheck();
+ {
+ dblock lk;
+ Client::Context ctx( ns );
+ theDataFileMgr.insertWithObjMod( ns.c_str(), obj, true );
+ }
+ return true;
+ }
+ } cmdGodInsert;
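+
+ /* Hedged usage sketch (test only; rejected unless run from localhost):
+ db.runCommand( { godinsert: "foo", obj: { _id: 1 } } )
+ inserts the given object directly into <db>.foo. */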
+
+ class DBHashCmd : public Command {
+ public:
+ DBHashCmd() : Command( "dbHash", false, "dbhash" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ list<string> colls;
+ Database* db = cc().database();
+ if ( db )
+ db->namespaceIndex.getNamespaces( colls );
+ colls.sort();
+
+ result.appendNumber( "numCollections" , (long long)colls.size() );
+ result.append( "host" , prettyHostName() );
+
+ md5_state_t globalState;
+ md5_init(&globalState);
+
+ BSONObjBuilder bb( result.subobjStart( "collections" ) );
+ for ( list<string>::iterator i=colls.begin(); i != colls.end(); i++ ) {
+ string c = *i;
+ if ( c.find( ".system.profil" ) != string::npos )
+ continue;
+
+ shared_ptr<Cursor> cursor;
+
+ NamespaceDetails * nsd = nsdetails( c.c_str() );
+
+ // debug SERVER-761
+ NamespaceDetails::IndexIterator ii = nsd->ii();
+ while( ii.more() ) {
+ const IndexDetails &idx = ii.next();
+ if ( !idx.head.isValid() || !idx.info.isValid() ) {
+ log() << "invalid index for ns: " << c << " " << idx.head << " " << idx.info;
+ if ( idx.info.isValid() )
+ log() << " " << idx.info.obj();
+ log() << endl;
+ }
+ }
+
+ int idNum = nsd->findIdIndex();
+ if ( idNum >= 0 ) {
+ cursor.reset( BtreeCursor::make( nsd , idNum , nsd->idx( idNum ) , BSONObj() , BSONObj() , false , 1 ) );
+ }
+ else if ( c.find( ".system." ) != string::npos ) {
+ continue;
+ }
+ else if ( nsd->capped ) {
+ cursor = findTableScan( c.c_str() , BSONObj() );
+ }
+ else {
+ log() << "can't find _id index for: " << c << endl;
+ continue;
+ }
+
+ md5_state_t st;
+ md5_init(&st);
+
+ long long n = 0;
+ while ( cursor->ok() ) {
+ BSONObj c = cursor->current();
+ md5_append( &st , (const md5_byte_t*)c.objdata() , c.objsize() );
+ n++;
+ cursor->advance();
+ }
+ md5digest d;
+ md5_finish(&st, d);
+ string hash = digestToString( d );
+
+ bb.append( c.c_str() + ( dbname.size() + 1 ) , hash );
+
+ md5_append( &globalState , (const md5_byte_t*)hash.c_str() , hash.size() );
+ }
+ bb.done();
+
+ md5digest d;
+ md5_finish(&globalState, d);
+ string hash = digestToString( d );
+
+ result.append( "md5" , hash );
+
+ return 1;
+ }
+
+ } dbhashCmd;
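+
+ /* Hedged output sketch for { dbHash: 1 } (values illustrative):
+ { numCollections: 2, host: "example:27017",
+ collections: { foo: "<hex md5>", bar: "<hex md5>" },
+ md5: "<hex md5>", ok: 1 }
+ each collection is hashed in _id-index order (natural order for capped
+ collections without an _id index); the top-level md5 chains the
+ per-collection hex digests in sorted collection-name order. */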
+
+ /* for diagnostic / testing purposes. */
+ class CmdSleep : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "internal testing command. Makes db block (in a read lock) for 100 seconds\n";
+ help << "w:true write lock. secs:<seconds>";
+ }
+ CmdSleep() : Command("sleep") { }
+ bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << "test only command sleep invoked" << endl;
+ int secs = 100;
+ if ( cmdObj["secs"].isNumber() )
+ secs = cmdObj["secs"].numberInt();
+ if( cmdObj.getBoolField("w") ) {
+ writelock lk("");
+ sleepsecs(secs);
+ }
+ else {
+ readlock lk("");
+ sleepsecs(secs);
+ }
+ return true;
+ }
+ } cmdSleep;
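+
+ /* Hedged usage sketch (test only):
+ db.adminCommand( { sleep: 1, w: true, secs: 5 } )
+ holds the write lock for 5 seconds; without w:true a read lock is held,
+ and without secs the 100 second default applies. */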
+
+ // just for testing
+ class CapTrunc : public Command {
+ public:
+ CapTrunc() : Command( "captrunc" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool requiresAuth() { return true; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string coll = cmdObj[ "captrunc" ].valuestrsafe();
+ uassert( 13416, "captrunc must specify a collection", !coll.empty() );
+ string ns = dbname + "." + coll;
+ int n = cmdObj.getIntField( "n" );
+
+ // inc: whether the truncation point record is itself removed (inclusive) or kept
+ bool inc = cmdObj.getBoolField( "inc" );
+ NamespaceDetails *nsd = nsdetails( ns.c_str() );
+ ReverseCappedCursor c( nsd );
+ massert( 13417, "captrunc collection not found or empty", c.ok() );
+ for( int i = 0; i < n; ++i ) {
+ massert( 13418, "captrunc invalid n", c.advance() );
+ }
+ DiskLoc end = c.currLoc();
+ nsd->cappedTruncateAfter( ns.c_str(), end, inc );
+ return true;
+ }
+ } capTruncCmd;
+
+ // just for testing
+ class EmptyCapped : public Command {
+ public:
+ EmptyCapped() : Command( "emptycapped" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool requiresAuth() { return true; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string coll = cmdObj[ "emptycapped" ].valuestrsafe();
+ uassert( 13428, "emptycapped must specify a collection", !coll.empty() );
+ string ns = dbname + "." + coll;
+ NamespaceDetails *nsd = nsdetails( ns.c_str() );
+ massert( 13429, "emptycapped no such collection", nsd );
+ nsd->emptyCappedCollection( ns.c_str() );
+ return true;
+ }
+ } emptyCappedCmd;
+
+ bool _execCommand(Command *c, const string& dbname, BSONObj& cmdObj, int queryOptions, BSONObjBuilder& result, bool fromRepl) {
+
+ try {
+ string errmsg;
+ if ( ! c->run(dbname, cmdObj, queryOptions, errmsg, result, fromRepl ) ) {
+ result.append( "errmsg" , errmsg );
+ return false;
+ }
+ }
+ catch ( SendStaleConfigException& e ){
+ log(1) << "command failed because of stale config, can retry" << causedBy( e ) << endl;
+ throw;
+ }
+ catch ( DBException& e ) {
+
+ // TODO: Rethrown errors have issues here, should divorce SendStaleConfigException from the DBException tree
+
+ stringstream ss;
+ ss << "exception: " << e.what();
+ result.append( "errmsg" , ss.str() );
+ result.append( "code" , e.getCode() );
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * this handles
+ * - auth
+ * - maintenance mode
+ * - locking
+ * - context
+ * then calls run()
+ */
+ bool execCommand( Command * c ,
+ Client& client , int queryOptions ,
+ const char *cmdns, BSONObj& cmdObj ,
+ BSONObjBuilder& result,
+ bool fromRepl ) {
+
+ string dbname = nsToDatabase( cmdns );
+
+ AuthenticationInfo *ai = client.getAuthenticationInfo();
+
+ if( c->adminOnly() && c->localHostOnlyIfNoAuth( cmdObj ) && noauth && !ai->isLocalHost ) {
+ result.append( "errmsg" ,
+ "unauthorized: this command must run from localhost when running db without auth" );
+ log() << "command denied: " << cmdObj.toString() << endl;
+ return false;
+ }
+
+ if ( c->adminOnly() && ! fromRepl && dbname != "admin" ) {
+ result.append( "errmsg" , "access denied; use admin db" );
+ log() << "command denied: " << cmdObj.toString() << endl;
+ return false;
+ }
+
+ if ( cmdObj["help"].trueValue() ) {
+ client.curop()->ensureStarted();
+ stringstream ss;
+ ss << "help for: " << c->name << " ";
+ c->help( ss );
+ result.append( "help" , ss.str() );
+ result.append( "lockType" , c->locktype() );
+ return true;
+ }
+
+ bool canRunHere =
+ isMaster( dbname.c_str() ) ||
+ c->slaveOk() ||
+ ( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) ||
+ fromRepl;
+
+ if ( ! canRunHere ) {
+ result.append( "errmsg" , "not master" );
+ result.append( "note" , "from execCommand" );
+ return false;
+ }
+
+ if ( ! c->maintenanceOk() && theReplSet && ! isMaster( dbname.c_str() ) && ! theReplSet->isSecondary() ) {
+ result.append( "errmsg" , "node is recovering" );
+ result.append( "note" , "from execCommand" );
+ return false;
+ }
+
+ if ( c->adminOnly() )
+ log( 2 ) << "command: " << cmdObj << endl;
+
+ if (c->maintenanceMode() && theReplSet && theReplSet->isSecondary()) {
+ theReplSet->setMaintenanceMode(true);
+ }
+
+ bool retval = false;
+ if ( c->locktype() == Command::NONE ) {
+ // locktype NONE: no lock is taken; we also trust that the command won't crash
+ retval = true;
+
+ if ( c->requiresAuth() ) {
+ // test that the user has at least read permissions
+ if ( ! client.getAuthenticationInfo()->isAuthorizedReads( dbname ) ) {
+ result.append( "errmsg" , "need to login" );
+ retval = false;
+ }
+ }
+
+ if (retval) {
+ client.curop()->ensureStarted();
+ retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl );
+ }
+ }
+ else if( c->locktype() != Command::WRITE ) {
+ // read lock
+ assert( ! c->logTheOp() );
+ string ns = c->parseNs(dbname, cmdObj);
+ Client::ReadContext ctx( ns , dbpath, c->requiresAuth() ); // read locks
+ client.curop()->ensureStarted();
+ retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl );
+ }
+ else {
+ dassert( c->locktype() == Command::WRITE );
+ writelock lk;
+ client.curop()->ensureStarted();
+ Client::Context ctx( dbname , dbpath , c->requiresAuth() );
+ retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl );
+ if ( retval && c->logTheOp() && ! fromRepl ) {
+ logOp("c", cmdns, cmdObj);
+ }
+ }
+
+ if (c->maintenanceMode() && theReplSet) {
+ theReplSet->setMaintenanceMode(false);
+ }
+
+ return retval;
+ }
+
+
+ /* TODO make these all command objects -- legacy stuff here
+
+ usage:
+ abc.$cmd.findOne( { ismaster:1 } );
+
+ returns true if ran a cmd
+ */
+ bool _runCommands(const char *ns, BSONObj& _cmdobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
+ string dbname = nsToDatabase( ns );
+
+ if( logLevel >= 1 )
+ log() << "run command " << ns << ' ' << _cmdobj << endl;
+
+ const char *p = strchr(ns, '.');
+ if ( !p ) return false;
+ if ( strcmp(p, ".$cmd") != 0 ) return false;
+
+ BSONObj jsobj;
+ {
+ BSONElement e = _cmdobj.firstElement();
+ if ( e.type() == Object && (e.fieldName()[0] == '$'
+ ? str::equals("query", e.fieldName()+1)
+ : str::equals("query", e.fieldName())))
+ {
+ jsobj = e.embeddedObject();
+ }
+ else {
+ jsobj = _cmdobj;
+ }
+ }
+
+ Client& client = cc();
+ bool ok = false;
+
+ BSONElement e = jsobj.firstElement();
+
+ Command * c = e.type() ? Command::findCommand( e.fieldName() ) : 0;
+
+ if ( c ) {
+ ok = execCommand( c , client , queryOptions , ns , jsobj , anObjBuilder , fromRepl );
+ }
+ else {
+ anObjBuilder.append("errmsg", str::stream() << "no such cmd: " << e.fieldName() );
+ anObjBuilder.append("bad cmd" , _cmdobj );
+ }
+
+ // switch to bool, but wait a bit longer before switching?
+ // anObjBuilder.append("ok", ok);
+ anObjBuilder.append("ok", ok?1.0:0.0);
+ BSONObj x = anObjBuilder.done();
+ b.appendBuf((void*) x.objdata(), x.objsize());
+
+ return true;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/dbcommands_admin.cpp b/src/mongo/db/dbcommands_admin.cpp
new file mode 100644
index 00000000000..ffcc3f261fe
--- /dev/null
+++ b/src/mongo/db/dbcommands_admin.cpp
@@ -0,0 +1,550 @@
+// dbcommands_admin.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ this file has dbcommands that are for dba type administration,
+ mostly around dbs and collections,
+ NOT system stuff
+*/
+
+
+#include "pch.h"
+#include "jsobj.h"
+#include "pdfile.h"
+#include "namespace-inl.h"
+#include "commands.h"
+#include "cmdline.h"
+#include "btree.h"
+#include "curop-inl.h"
+#include "../util/background.h"
+#include "../util/logfile.h"
+#include "../util/alignedbuilder.h"
+#include "../util/paths.h"
+#include "../scripting/engine.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ class CleanCmd : public Command {
+ public:
+ CleanCmd() : Command( "clean" ) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return WRITE; }
+
+ virtual void help(stringstream& h) const { h << "internal"; }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string dropns = dbname + "." + cmdObj.firstElement().valuestrsafe();
+
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: clean " << dropns << endl;
+
+ NamespaceDetails *d = nsdetails(dropns.c_str());
+
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ for ( int i = 0; i < Buckets; i++ )
+ d->deletedList[i].Null();
+
+ result.append("ns", dropns.c_str());
+ return 1;
+ }
+
+ } cleanCmd;
+
+ namespace dur {
+ boost::filesystem::path getJournalDir();
+ }
+
+ class JournalLatencyTestCmd : public Command {
+ public:
+ JournalLatencyTestCmd() : Command( "journalLatencyTest" ) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+ virtual void help(stringstream& h) const { h << "test how long to write and fsync to a test file in the journal/ directory"; }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ boost::filesystem::path p = dur::getJournalDir();
+ p /= "journalLatencyTest";
+
+ // remove file if already present
+ try {
+ remove(p);
+ }
+ catch(...) { }
+
+ BSONObjBuilder bb[2];
+ for( int pass = 0; pass < 2; pass++ ) {
+ LogFile f(p.string());
+ AlignedBuilder b(1024 * 1024);
+ {
+ Timer t;
+ for( int i = 0 ; i < 100; i++ ) {
+ f.synchronousAppend(b.buf(), 8192);
+ }
+ bb[pass].append("8KB", t.millis() / 100.0);
+ }
+ {
+ const int N = 50;
+ Timer t2;
+ long long x = 0;
+ for( int i = 0 ; i < N; i++ ) {
+ Timer t;
+ f.synchronousAppend(b.buf(), 8192);
+ x += t.micros();
+ sleepmillis(4);
+ }
+ long long y = t2.micros() - 4*N*1000;
+ // not really trusting the timer granularity on all platforms, so take whichever of x and y is higher
+ bb[pass].append("8KBWithPauses", max(x,y) / (N*1000.0));
+ }
+ {
+ Timer t;
+ for( int i = 0 ; i < 20; i++ ) {
+ f.synchronousAppend(b.buf(), 1024 * 1024);
+ }
+ bb[pass].append("1MB", t.millis() / 20.0);
+ }
+ // second time around, we are prealloced.
+ }
+ result.append("timeMillis", bb[0].obj());
+ result.append("timeMillisWithPrealloc", bb[1].obj());
+
+ try {
+ remove(p);
+ }
+ catch(...) { }
+
+ try {
+ result.append("onSamePartition", onSamePartition(dur::getJournalDir().string(), dbpath));
+ }
+ catch(...) { }
+
+ return 1;
+ }
+ } journalLatencyTestCmd;
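+
+ /* Hedged usage sketch: db.adminCommand( { journalLatencyTest: 1 } ) returns, roughly,
+ { timeMillis: { "8KB": <ms>, "8KBWithPauses": <ms>, "1MB": <ms> },
+ timeMillisWithPrealloc: { ... }, onSamePartition: <bool>, ok: 1 }
+ i.e. average milliseconds per synchronous append at each write size, the second
+ pass measured against the already-preallocated test file. */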
+
+ class ValidateCmd : public Command {
+ public:
+ ValidateCmd() : Command( "validate" ) {}
+
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ virtual void help(stringstream& h) const { h << "Validate contents of a namespace by scanning its data structures for correctness. Slow.\n"
+ "Add full:true option to do a more thorough check"; }
+
+ virtual LockType locktype() const { return READ; }
+ // { validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] [, full: <bool>] }
+
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string ns = dbname + "." + cmdObj.firstElement().valuestrsafe();
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: validate " << ns << endl;
+
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ result.append( "ns", ns );
+ validateNS( ns.c_str() , d, cmdObj, result);
+ return 1;
+ }
+
+ private:
+ void validateNS(const char *ns, NamespaceDetails *d, const BSONObj& cmdObj, BSONObjBuilder& result) {
+ const bool full = cmdObj["full"].trueValue();
+ const bool scanData = full || cmdObj["scandata"].trueValue();
+
+ bool valid = true;
+ BSONArrayBuilder errors; // explanation(s) for why valid = false
+ if ( d->capped ){
+ result.append("capped", d->capped);
+ result.append("max", d->max);
+ }
+
+ result.append("firstExtent", str::stream() << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString());
+ result.append( "lastExtent", str::stream() << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString());
+
+ BSONArrayBuilder extentData;
+
+ try {
+ d->firstExtent.ext()->assertOk();
+ d->lastExtent.ext()->assertOk();
+
+ DiskLoc el = d->firstExtent;
+ int ne = 0;
+ while( !el.isNull() ) {
+ Extent *e = el.ext();
+ e->assertOk();
+ el = e->xnext;
+ ne++;
+ if ( full )
+ extentData << e->dump();
+
+ killCurrentOp.checkForInterrupt();
+ }
+ result.append("extentCount", ne);
+ }
+ catch (...) {
+ valid=false;
+ errors << "extent asserted";
+ }
+
+ if ( full )
+ result.appendArray( "extents" , extentData.arr() );
+
+
+ result.appendNumber("datasize", d->stats.datasize);
+ result.appendNumber("nrecords", d->stats.nrecords);
+ result.appendNumber("lastExtentSize", d->lastExtentSize);
+ result.appendNumber("padding", d->paddingFactor);
+
+
+ try {
+
+ try {
+ result.append("firstExtentDetails", d->firstExtent.ext()->dump());
+
+ valid = valid && d->firstExtent.ext()->validates() &&
+ d->firstExtent.ext()->xprev.isNull();
+ }
+ catch (...) {
+ errors << "exception firstextent";
+ valid = false;
+ }
+
+ set<DiskLoc> recs;
+ if( scanData ) {
+ shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ int n = 0;
+ int nInvalid = 0;
+ long long len = 0;
+ long long nlen = 0;
+ int outOfOrder = 0;
+ DiskLoc cl_last;
+ while ( c->ok() ) {
+ n++;
+
+ DiskLoc cl = c->currLoc();
+ if ( n < 1000000 )
+ recs.insert(cl);
+ if ( d->capped ) {
+ if ( cl < cl_last )
+ outOfOrder++;
+ cl_last = cl;
+ }
+
+ Record *r = c->_current();
+ len += r->lengthWithHeaders;
+ nlen += r->netLength();
+
+ if (full){
+ BSONObj obj(r);
+ if (!obj.isValid() || !obj.valid()){ // both fast and deep checks
+ valid = false;
+ if (nInvalid == 0) // only append this error message once
+ errors << "invalid bson object detected (see logs for more info)";
+
+ nInvalid++;
+ if (strcmp("_id", obj.firstElementFieldName()) == 0){
+ try {
+ obj.firstElement().validate(); // throws on error
+ log() << "Invalid bson detected in " << ns << " with _id: " << obj.firstElement().toString(false) << endl;
+ }
+ catch(...){
+ log() << "Invalid bson detected in " << ns << " with corrupt _id" << endl;
+ }
+ }
+ else {
+ log() << "Invalid bson detected in " << ns << " and couldn't find _id" << endl;
+ }
+ }
+ }
+
+ c->advance();
+ }
+ if ( d->capped && !d->capLooped() ) {
+ result.append("cappedOutOfOrder", outOfOrder);
+ if ( outOfOrder > 1 ) {
+ valid = false;
+ errors << "too many out of order records";
+ }
+ }
+ result.append("objectsFound", n);
+
+ if (full) {
+ result.append("invalidObjects", nInvalid);
+ }
+
+ result.appendNumber("bytesWithHeaders", len);
+ result.appendNumber("bytesWithoutHeaders", nlen);
+ }
+
+ BSONArrayBuilder deletedListArray;
+ for ( int i = 0; i < Buckets; i++ ) {
+ deletedListArray << d->deletedList[i].isNull();
+ }
+
+ int ndel = 0;
+ long long delSize = 0;
+ int incorrect = 0;
+ for ( int i = 0; i < Buckets; i++ ) {
+ DiskLoc loc = d->deletedList[i];
+ try {
+ int k = 0;
+ while ( !loc.isNull() ) {
+ if ( recs.count(loc) )
+ incorrect++;
+ ndel++;
+
+ if ( loc.questionable() ) {
+ if( d->capped && !loc.isValid() && i == 1 ) {
+ /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
+ see comments in namespace.h
+ */
+ break;
+ }
+
+ if ( loc.a() <= 0 || strstr(ns, "hudsonSmall") == 0 ) {
+ string err (str::stream() << "bad deleted loc: " << loc.toString() << " bucket:" << i << " k:" << k);
+ errors << err;
+
+ valid = false;
+ break;
+ }
+ }
+
+ DeletedRecord *d = loc.drec();
+ delSize += d->lengthWithHeaders;
+ loc = d->nextDeleted;
+ k++;
+ killCurrentOp.checkForInterrupt();
+ }
+ }
+ catch (...) {
+ errors << ("exception in deleted chain for bucket " + BSONObjBuilder::numStr(i));
+ valid = false;
+ }
+ }
+ result.appendNumber("deletedCount", ndel);
+ result.appendNumber("deletedSize", delSize);
+
+ if ( incorrect ) {
+ errors << (BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list");
+ valid = false;
+ }
+
+ int idxn = 0;
+ try {
+ result.append("nIndexes", d->nIndexes);
+ BSONObjBuilder indexes; // not using subObjStart to be exception safe
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& id = i.next();
+ long long keys = id.idxInterface().fullValidate(id.head, id.keyPattern());
+ indexes.appendNumber(id.indexNamespace(), keys);
+ }
+ result.append("keysPerIndex", indexes.done());
+ }
+ catch (...) {
+ errors << ("exception during index validate idxn " + BSONObjBuilder::numStr(idxn));
+ valid=false;
+ }
+
+ }
+ catch (AssertionException) {
+ errors << "exception during validate";
+ valid = false;
+ }
+
+ result.appendBool("valid", valid);
+ result.append("errors", errors.arr());
+
+ if ( !full ){
+ result.append("warning", "Some checks omitted for speed. use {full:true} option to do more thorough scan.");
+ }
+
+ if ( !valid ) {
+ result.append("advice", "ns corrupt, requires repair");
+ }
+
+ }
+ } validateCmd;
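+
+ /* Hedged usage sketch: db.runCommand( { validate: "foo", full: true } )
+ walks <db>.foo's extents, records, deleted lists and indexes, returning the
+ counts gathered above plus valid:<bool> and an errors array; full:true adds
+ per-document BSON validity checks. */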
+
+ bool lockedForWriting = false; // read from db/instance.cpp
+ static bool unlockRequested = false;
+ static mongo::mutex fsyncLockMutex("fsyncLock");
+ static boost::condition fsyncLockCondition;
+ static OID fsyncLockID; // identifies the current lock job
+
+ /*
+ class UnlockCommand : public Command {
+ public:
+ UnlockCommand() : Command( "unlock" ) { }
+ virtual bool readOnly() { return true; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( lockedForWriting ) {
+ log() << "command: unlock requested" << endl;
+ errmsg = "unlock requested";
+ unlockRequested = true;
+ }
+ else {
+ errmsg = "not locked, so cannot unlock";
+ return 0;
+ }
+ return 1;
+ }
+
+ } unlockCommand;
+ */
+ /* see unlockFsync() for unlocking:
+ db.$cmd.sys.unlock.findOne()
+ */
+ class FSyncCommand : public Command {
+ static const char* url() { return "http://www.mongodb.org/display/DOCS/fsync+Command"; }
+ class LockDBJob : public BackgroundJob {
+ protected:
+ virtual string name() const { return "lockdbjob"; }
+ void run() {
+ Client::initThread("fsyncjob");
+ Client& c = cc();
+ {
+ scoped_lock lk(fsyncLockMutex);
+ while (lockedForWriting){ // there is a small window for two LockDBJob's to be active. This prevents it.
+ fsyncLockCondition.wait(lk.boost());
+ }
+ lockedForWriting = true;
+ fsyncLockID.init();
+ }
+ readlock lk("");
+ MemoryMappedFile::flushAll(true);
+ log() << "db is now locked for snapshotting, no writes allowed. db.fsyncUnlock() to unlock" << endl;
+ log() << " For more info see " << FSyncCommand::url() << endl;
+ _ready = true;
+ {
+ scoped_lock lk(fsyncLockMutex);
+ while( !unlockRequested ) {
+ fsyncLockCondition.wait(lk.boost());
+ }
+ unlockRequested = false;
+ lockedForWriting = false;
+ fsyncLockCondition.notify_all();
+ }
+ c.shutdown();
+ }
+ public:
+ bool& _ready;
+ LockDBJob(bool& ready) : BackgroundJob( true /* delete self */ ), _ready(ready) {
+ _ready = false;
+ }
+ };
+ public:
+ FSyncCommand() : Command( "fsync" ) {}
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) {
+ string x = cmdObj["exec"].valuestrsafe();
+ return !x.empty();
+ }*/
+ virtual void help(stringstream& h) const { h << url(); }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool sync = !cmdObj["async"].trueValue(); // async means do an fsync, but return immediately
+ bool lock = cmdObj["lock"].trueValue();
+ log() << "CMD fsync: sync:" << sync << " lock:" << lock << endl;
+
+ if( lock ) {
+ // fsync and lock variation
+
+ uassert(12034, "fsync: can't lock while an unlock is pending", !unlockRequested);
+ uassert(12032, "fsync: sync option must be true when using lock", sync);
+ /* With releaseEarly(), we must be extremely careful we don't do anything
+ where we would have assumed we were locked. profiling is one of those things.
+ Perhaps at profile time we could check if we released early -- however,
+ we need to be careful to keep that code very fast; it's a very common code path when profiling is on.
+ */
+ uassert(12033, "fsync: profiling must be off to enter locked mode", cc().database()->profile == 0);
+
+ // todo future: Perhaps we could do this in the background thread. As is now, writes may interleave between
+ // the releaseEarly below and the acquisition of the readlock in the background thread.
+ // However the real problem is that it seems complex to unlock here and then have a window for
+ // writes before the bg job -- can be done correctly but harder to reason about correctness.
+ // If this command ran within a read lock in the first place, would it work, and then that
+ // would be quite easy?
+ // Or, could we downgrade the write lock to a read lock, wait for ready, then release?
+ getDur().syncDataAndTruncateJournal();
+
+ bool ready = false;
+ LockDBJob *l = new LockDBJob(ready);
+
+ d.dbMutex.releaseEarly();
+
+ // There is a narrow window for another lock request to come in
+ // here before the LockDBJob grabs the readlock. LockDBJob will
+ // ensure that the requests are serialized and never running
+ // concurrently
+
+ l->go();
+ // don't return until background thread has acquired the read lock
+ while( !ready ) {
+ sleepmillis(10);
+ }
+ result.append("info", "now locked against writes, use db.fsyncUnlock() to unlock");
+ result.append("seeAlso", url());
+ }
+ else {
+ // the simple fsync command case
+
+ if (sync)
+ getDur().commitNow();
+ result.append( "numFiles" , MemoryMappedFile::flushAll( sync ) );
+ }
+ return 1;
+ }
+
+ } fsyncCmd;
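+
+ /* Hedged usage sketch (shell syntax):
+ db.adminCommand( { fsync: 1 } ) // flush dirty data files now
+ db.adminCommand( { fsync: 1, lock: true } ) // flush, then block writes (e.g. for backup)
+ db.fsyncUnlock() // request unlock; see unlockFsyncAndWait() below
+ */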
+
+ // Note that this will only unlock the current lock. If another thread
+ // relocks before we return we still consider the unlocking successful.
+ // This is important because if two scripts are trying to fsync-lock, each
+ // one must be assured that between the fsync return and the call to unlock
+ // the database is fully locked
+ void unlockFsyncAndWait(){
+ scoped_lock lk(fsyncLockMutex);
+ if (lockedForWriting) { // could have handled another unlock before we grabbed the lock
+ OID curOp = fsyncLockID;
+ unlockRequested = true;
+ fsyncLockCondition.notify_all();
+ while (lockedForWriting && fsyncLockID == curOp){
+ fsyncLockCondition.wait( lk.boost() );
+ }
+ }
+ }
+}
+
diff --git a/src/mongo/db/dbcommands_generic.cpp b/src/mongo/db/dbcommands_generic.cpp
new file mode 100644
index 00000000000..cfd833aa72d
--- /dev/null
+++ b/src/mongo/db/dbcommands_generic.cpp
@@ -0,0 +1,432 @@
+/** @file dbcommands_generic.cpp commands suited for any mongo server (both mongod, mongos) */
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "ops/query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../bson/util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "../util/md5.hpp"
+#include "../util/processinfo.h"
+#include "json.h"
+#include "repl.h"
+#include "repl_block.h"
+#include "replutil.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "../scripting/engine.h"
+#include "stats/counters.h"
+#include "background.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "repl/multicmd.h"
+#include "server.h"
+
+namespace mongo {
+
+#if 0
+ namespace cloud {
+ SimpleMutex mtx("cloud");
+ Guarded< vector<string>, mtx > ips;
+ bool startedThread = false;
+
+ void thread() {
+ bson::bo cmd;
+ while( 1 ) {
+ list<Target> L;
+ {
+ SimpleMutex::scoped_lock lk(mtx);
+ if( ips.ref(lk).empty() )
+ continue;
+ for( unsigned i = 0; i < ips.ref(lk).size(); i++ ) {
+ L.push_back( Target(ips.ref(lk)[i]) );
+ }
+ }
+
+
+ /** repoll as machines might be down on the first lookup (only if not found previously) */
+ sleepsecs(6);
+ }
+ }
+ }
+
+ class CmdCloud : public Command {
+ public:
+ CmdCloud() : Command( "cloud" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "internal command facilitating running in certain cloud computing environments";
+ }
+ bool run(const string& dbname, BSONObj& obj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ if( !obj.hasElement("servers") ) {
+ vector<string> ips;
+ obj["servers"].Obj().Vals(ips);
+ {
+ SimpleMutex::scoped_lock lk(cloud::mtx);
+ cloud::ips.ref(lk).swap(ips);
+ if( !cloud::startedThread ) {
+ cloud::startedThread = true;
+ boost::thread thr(cloud::thread);
+ }
+ }
+ }
+ return true;
+ }
+ } cmdCloud;
+#endif
+
+ class CmdBuildInfo : public Command {
+ public:
+ CmdBuildInfo() : Command( "buildInfo", true, "buildinfo" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool requiresAuth() { return false; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "get version #, etc.\n";
+ help << "{ buildinfo:1 }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ result << "version" << versionString << "gitVersion" << gitVersion() << "sysInfo" << sysInfo();
+ result << "versionArray" << versionArray;
+ result << "bits" << ( sizeof( int* ) == 4 ? 32 : 64 );
+ result.appendBool( "debug" , debug );
+ result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
+ return true;
+ }
+ } cmdBuildInfo;
+
+ /** experimental. either remove or add support in repl sets also. in a repl set, getting this setting from the
+ repl set config could make sense.
+ */
+ unsigned replApplyBatchSize = 1;
+
+ class CmdGet : public Command {
+ public:
+ CmdGet() : Command( "getParameter" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "get administrative option(s)\nexample:\n";
+ help << "{ getParameter:1, notablescan:1 }\n";
+ help << "supported so far:\n";
+ help << " quiet\n";
+ help << " notablescan\n";
+ help << " logLevel\n";
+ help << " syncdelay\n";
+ help << "{ getParameter:'*' } to get everything\n";
+ }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool all = *cmdObj.firstElement().valuestrsafe() == '*';
+
+ int before = result.len();
+
+ if( all || cmdObj.hasElement("quiet") ) {
+ result.append("quiet", cmdLine.quiet );
+ }
+ if( all || cmdObj.hasElement("notablescan") ) {
+ result.append("notablescan", cmdLine.noTableScan);
+ }
+ if( all || cmdObj.hasElement("logLevel") ) {
+ result.append("logLevel", logLevel);
+ }
+ if( all || cmdObj.hasElement("syncdelay") ) {
+ result.append("syncdelay", cmdLine.syncdelay);
+ }
+ if( all || cmdObj.hasElement("replApplyBatchSize") ) {
+ result.append("replApplyBatchSize", replApplyBatchSize);
+ }
+
+ if ( before == result.len() ) {
+ errmsg = "no option found to get";
+ return false;
+ }
+ return true;
+ }
+ } cmdGet;
+
+ // tempish
+ bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl );
+
+ class CmdSet : public Command {
+ public:
+ CmdSet() : Command( "setParameter" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "set administrative option(s)\n";
+ help << "{ setParameter:1, <param>:<value> }\n";
+ help << "supported so far:\n";
+ help << " journalCommitInterval\n";
+ help << " logLevel\n";
+ help << " notablescan\n";
+ help << " quiet\n";
+ help << " syncdelay\n";
+ }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ int s = 0;
+ bool found = setParmsMongodSpecific(dbname, cmdObj, errmsg, result, fromRepl);
+ if( cmdObj.hasElement("journalCommitInterval") ) {
+ if( !cmdLine.dur ) {
+ errmsg = "journaling is off";
+ return false;
+ }
+ int x = (int) cmdObj["journalCommitInterval"].Number();
+ assert( x > 1 && x < 500 );
+ cmdLine.journalCommitInterval = x;
+ log() << "setParameter journalCommitInterval=" << x << endl;
+ s++;
+ }
+ if( cmdObj.hasElement("notablescan") ) {
+ assert( !cmdLine.isMongos() );
+ if( s == 0 )
+ result.append("was", cmdLine.noTableScan);
+ cmdLine.noTableScan = cmdObj["notablescan"].Bool();
+ s++;
+ }
+ if( cmdObj.hasElement("quiet") ) {
+ if( s == 0 )
+ result.append("was", cmdLine.quiet );
+ cmdLine.quiet = cmdObj["quiet"].Bool();
+ s++;
+ }
+ if( cmdObj.hasElement("syncdelay") ) {
+ assert( !cmdLine.isMongos() );
+ if( s == 0 )
+ result.append("was", cmdLine.syncdelay );
+ cmdLine.syncdelay = cmdObj["syncdelay"].Number();
+ s++;
+ }
+ if( cmdObj.hasElement( "logLevel" ) ) {
+ if( s == 0 )
+ result.append("was", logLevel );
+ logLevel = cmdObj["logLevel"].numberInt();
+ s++;
+ }
+ if( cmdObj.hasElement( "replApplyBatchSize" ) ) {
+ if( s == 0 )
+ result.append("was", replApplyBatchSize );
+ BSONElement e = cmdObj["replApplyBatchSize"];
+ ParameterValidator * v = ParameterValidator::get( e.fieldName() );
+ assert( v );
+ if ( ! v->isValid( e , errmsg ) )
+ return false;
+ replApplyBatchSize = e.numberInt();
+ s++;
+ }
+ if( cmdObj.hasElement( "traceExceptions" ) ) {
+ if( s == 0 ) result.append( "was", DBException::traceExceptions );
+ DBException::traceExceptions = cmdObj["traceExceptions"].Bool();
+ s++;
+ }
+
+ if( s == 0 && !found ) {
+ errmsg = "no option found to set, use help:true to see options ";
+ return false;
+ }
+
+ return true;
+ }
+ } cmdSet;
+
+ class PingCommand : public Command {
+ public:
+ PingCommand() : Command( "ping" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream &help ) const { help << "a way to check that the server is alive. responds immediately even if server is in a db lock."; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool run(const string& badns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ // IMPORTANT: Don't put anything in here that might lock db - including authentication
+ return true;
+ }
+ } pingCmd;
+
+ class FeaturesCmd : public Command {
+ public:
+ FeaturesCmd() : Command( "features", true ) {}
+ void help(stringstream& h) const { h << "return build level feature settings"; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool readOnly() { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( globalScriptEngine ) {
+ BSONObjBuilder bb( result.subobjStart( "js" ) );
+ result.append( "utf8" , globalScriptEngine->utf8Ok() );
+ bb.done();
+ }
+ if ( cmdObj["oidReset"].trueValue() ) {
+ result.append( "oidMachineOld" , OID::getMachineId() );
+ OID::regenMachineId();
+ }
+ result.append( "oidMachine" , OID::getMachineId() );
+ return true;
+ }
+
+ } featuresCmd;
+
+ class LogRotateCmd : public Command {
+ public:
+ LogRotateCmd() : Command( "logRotate" ) {}
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ rotateLogs();
+ return 1;
+ }
+
+ } logRotateCmd;
+
+ class ListCommandsCmd : public Command {
+ public:
+ virtual void help( stringstream &help ) const { help << "get a list of all db commands"; }
+ ListCommandsCmd() : Command( "listCommands", false ) {}
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONObjBuilder b( result.subobjStart( "commands" ) );
+ for ( map<string,Command*>::iterator i=_commands->begin(); i!=_commands->end(); ++i ) {
+ Command * c = i->second;
+
+ // don't show oldnames
+ if (i->first != c->name)
+ continue;
+
+ BSONObjBuilder temp( b.subobjStart( c->name ) );
+
+ {
+ stringstream help;
+ c->help( help );
+ temp.append( "help" , help.str() );
+ }
+ temp.append( "lockType" , c->locktype() );
+ temp.append( "slaveOk" , c->slaveOk() );
+ temp.append( "adminOnly" , c->adminOnly() );
+ //optionally indicates that the command can be forced to run on a slave/secondary
+ if ( c->slaveOverrideOk() ) temp.append( "slaveOverrideOk" , c->slaveOverrideOk() );
+ temp.done();
+ }
+ b.done();
+
+ return 1;
+ }
+
+ } listCommandsCmd;
+
+ bool CmdShutdown::shutdownHelper() {
+ Client * c = currentClient.get();
+ if ( c ) {
+ c->shutdown();
+ }
+
+ log() << "terminating, shutdown command received" << endl;
+
+ dbexit( EXIT_CLEAN , "shutdown called" , true ); // this never returns
+ assert(0);
+ return true;
+ }
+
+ /* for testing purposes only */
+ class CmdForceError : public Command {
+ public:
+ virtual void help( stringstream& help ) const {
+ help << "for testing purposes only. forces a user assertion exception";
+ }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdForceError() : Command("forceerror") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ uassert( 10038 , "forced error", false);
+ return true;
+ }
+ } cmdForceError;
+
+ class AvailableQueryOptions : public Command {
+ public:
+ AvailableQueryOptions() : Command( "availableQueryOptions" , false , "availablequeryoptions" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ result << "options" << QueryOption_AllSupported;
+ return true;
+ }
+ } availableQueryOptionsCmd;
+
+ class GetLogCmd : public Command {
+ public:
+ GetLogCmd() : Command( "getLog" ){}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+
+ virtual void help( stringstream& help ) const {
+ help << "{ getLog : '*' } OR { getLog : 'global' }";
+ }
+
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string p = cmdObj.firstElement().String();
+ if ( p == "*" ) {
+ vector<string> names;
+ RamLog::getNames( names );
+
+ BSONArrayBuilder arr;
+ for ( unsigned i=0; i<names.size(); i++ ) {
+ arr.append( names[i] );
+ }
+
+ result.appendArray( "names" , arr.arr() );
+ }
+ else {
+ RamLog* rl = RamLog::get( p );
+ if ( ! rl ) {
+ errmsg = str::stream() << "no RamLog named: " << p;
+ return false;
+ }
+
+ vector<const char*> lines;
+ rl->get( lines );
+
+ BSONArrayBuilder arr( result.subarrayStart( "log" ) );
+ for ( unsigned i=0; i<lines.size(); i++ )
+ arr.append( lines[i] );
+ arr.done();
+ }
+ return true;
+ }
+
+ } getLogCmd;
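+
+ /* Hedged usage sketch:
+ db.adminCommand( { getLog: "*" } ) // -> { names: [ "global", ... ], ok: 1 }
+ db.adminCommand( { getLog: "global" } ) // -> { log: [ "<line>", ... ], ok: 1 }
+ (the "global" ram log name is illustrative of typical registered names) */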
+
+}
diff --git a/src/mongo/db/dbeval.cpp b/src/mongo/db/dbeval.cpp
new file mode 100644
index 00000000000..9e77d8c8097
--- /dev/null
+++ b/src/mongo/db/dbeval.cpp
@@ -0,0 +1,136 @@
+/* dbeval.cpp
+ db "commands" (sent via db.$cmd.findOne(...))
+ */
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "ops/query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../bson/util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "json.h"
+#include "repl.h"
+#include "commands.h"
+#include "cmdline.h"
+
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ const int edebug=0;
+
+ bool dbEval(const string& dbName, BSONObj& cmd, BSONObjBuilder& result, string& errmsg) {
+ BSONElement e = cmd.firstElement();
+ uassert( 10046 , "eval needs Code" , e.type() == Code || e.type() == CodeWScope || e.type() == String );
+
+ const char *code = 0;
+ switch ( e.type() ) {
+ case String:
+ case Code:
+ code = e.valuestr();
+ break;
+ case CodeWScope:
+ code = e.codeWScopeCode();
+ break;
+ default:
+ assert(0);
+ }
+ assert( code );
+
+ if ( ! globalScriptEngine ) {
+ errmsg = "db side execution is disabled";
+ return false;
+ }
+
+ auto_ptr<Scope> s = globalScriptEngine->getPooledScope( dbName );
+ ScriptingFunction f = s->createFunction(code);
+ if ( f == 0 ) {
+ errmsg = (string)"compile failed: " + s->getError();
+ return false;
+ }
+
+ if ( e.type() == CodeWScope )
+ s->init( e.codeWScopeScopeData() );
+ s->localConnect( dbName.c_str() );
+
+ BSONObj args;
+ {
+ BSONElement argsElement = cmd.getField("args");
+ if ( argsElement.type() == Array ) {
+ args = argsElement.embeddedObject();
+ if ( edebug ) {
+ out() << "args:" << args.toString() << endl;
+ out() << "code:\n" << code << endl;
+ }
+ }
+ }
+
+ int res;
+ {
+ Timer t;
+ res = s->invoke(f, &args, 0, cmdLine.quota ? 10 * 60 * 1000 : 0 );
+ int m = t.millis();
+ if ( m > cmdLine.slowMS ) {
+ out() << "dbeval slow, time: " << dec << m << "ms " << dbName << endl;
+ if ( m >= 1000 ) log() << code << endl;
+ else OCCASIONALLY log() << code << endl;
+ }
+ }
+ if ( res ) {
+ result.append("errno", (double) res);
+ errmsg = "invoke failed: ";
+ errmsg += s->getError();
+ return false;
+ }
+
+ s->append( result , "retval" , "return" );
+
+ return true;
+ }
+
+ class CmdEval : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual void help( stringstream &help ) const {
+ help << "Evaluate javascript at the server.\n" "http://www.mongodb.org/display/DOCS/Server-side+Code+Execution";
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdEval() : Command("eval", false, "$eval") { }
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ uassert( 12598 , "$eval reads unauthorized", ai->isAuthorizedReads(dbname.c_str()) );
+
+ if ( cmdObj["nolock"].trueValue() ) {
+ return dbEval(dbname, cmdObj, result, errmsg);
+ }
+
+ // write security will be enforced in DBDirectClient
+ mongolock lk( ai->isAuthorized( dbname.c_str() ) );
+ Client::Context ctx( dbname );
+
+ return dbEval(dbname, cmdObj, result, errmsg);
+ }
+ } cmdeval;
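+
+ /* Hedged usage sketch (function and args illustrative):
+ db.runCommand( { $eval: function(x, y) { return x + y; }, args: [ 1, 2 ] } )
+ -> { retval: 3, ok: 1 }
+ nolock:true runs the script without taking the db lock. */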
+
+} // namespace mongo
diff --git a/src/mongo/db/dbhelpers.cpp b/src/mongo/db/dbhelpers.cpp
new file mode 100644
index 00000000000..39540c9ce89
--- /dev/null
+++ b/src/mongo/db/dbhelpers.cpp
@@ -0,0 +1,353 @@
+// dbhelpers.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "json.h"
+#include "queryoptimizer.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "oplog.h"
+#include "ops/update.h"
+#include "ops/delete.h"
+
+namespace mongo {
+
+ void Helpers::ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name) {
+ NamespaceDetails *d = nsdetails(ns);
+ if( d == 0 )
+ return;
+
+ {
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ if( i.next().keyPattern().woCompare(keyPattern) == 0 )
+ return;
+ }
+ }
+
+ if( d->nIndexes >= NamespaceDetails::NIndexesMax ) {
+ problem() << "Helper::ensureIndex fails, MaxIndexes exceeded " << ns << '\n';
+ return;
+ }
+
+ string system_indexes = cc().database()->name + ".system.indexes";
+
+ BSONObjBuilder b;
+ b.append("name", name);
+ b.append("ns", ns);
+ b.append("key", keyPattern);
+ b.appendBool("unique", unique);
+ BSONObj o = b.done();
+
+ theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize());
+ }
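+
+ /* Illustrative call (names assumed, not from this file):
+ Helpers::ensureIndex( "test.foo", BSON( "ts" << 1 ), false, "ts_1" );
+ a no-op if test.foo does not exist or an index with that key pattern is
+ already present; otherwise inserts the spec into <db>.system.indexes. */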
+
+ /* fetch a single object from collection ns that matches query
+ set your db SavedContext first
+ */
+ bool Helpers::findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex) {
+ DiskLoc loc = findOne( ns, query, requireIndex );
+ if ( loc.isNull() )
+ return false;
+ result = loc.obj();
+ return true;
+ }
+
+ /* fetch a single object from collection ns that matches query
+ set your db SavedContext first
+ */
+ DiskLoc Helpers::findOne(const char *ns, const BSONObj &query, bool requireIndex) {
+ shared_ptr<Cursor> c = NamespaceDetailsTransient::getCursor( ns, query, BSONObj(), requireIndex );
+ while( c->ok() ) {
+ if ( c->currentMatches() && !c->getsetdup( c->currLoc() ) ) {
+ return c->currLoc();
+ }
+ c->advance();
+ }
+ return DiskLoc();
+ }
+
+ bool Helpers::findById(Client& c, const char *ns, BSONObj query, BSONObj& result ,
+ bool * nsFound , bool * indexFound ) {
+ d.dbMutex.assertAtLeastReadLocked();
+ Database *database = c.database();
+ assert( database );
+ NamespaceDetails *d = database->namespaceIndex.details(ns);
+ if ( ! d )
+ return false;
+ if ( nsFound )
+ *nsFound = 1;
+
+ int idxNo = d->findIdIndex();
+ if ( idxNo < 0 )
+ return false;
+ if ( indexFound )
+ *indexFound = 1;
+
+ IndexDetails& i = d->idx( idxNo );
+
+ BSONObj key = i.getKeyFromQuery( query );
+
+ DiskLoc loc = i.idxInterface().findSingle(i , i.head , key);
+ if ( loc.isNull() )
+ return false;
+ result = loc.obj();
+ return true;
+ }
+
+ DiskLoc Helpers::findById(NamespaceDetails *d, BSONObj idquery) {
+ assert(d);
+ int idxNo = d->findIdIndex();
+ uassert(13430, "no _id index", idxNo>=0);
+ IndexDetails& i = d->idx( idxNo );
+ BSONObj key = i.getKeyFromQuery( idquery );
+ return i.idxInterface().findSingle(i , i.head , key);
+ }
+
+ bool Helpers::isEmpty(const char *ns, bool doAuth) {
+ Client::Context context(ns, dbpath, doAuth);
+ shared_ptr<Cursor> c = DataFileMgr::findAll(ns);
+ return !c->ok();
+ }
+
+ /* Get the first object from a collection. Generally only useful if the collection
+ only ever has a single object -- which is a "singleton collection".
+
+ Returns: true if object exists.
+ */
+ bool Helpers::getSingleton(const char *ns, BSONObj& result) {
+ Client::Context context(ns);
+
+ shared_ptr<Cursor> c = DataFileMgr::findAll(ns);
+ if ( !c->ok() ) {
+ context.getClient()->curop()->done();
+ return false;
+ }
+
+ result = c->current();
+ context.getClient()->curop()->done();
+ return true;
+ }
+
+ bool Helpers::getLast(const char *ns, BSONObj& result) {
+ Client::Context ctx(ns);
+ shared_ptr<Cursor> c = findTableScan(ns, reverseNaturalObj);
+ if( !c->ok() )
+ return false;
+ result = c->current();
+ return true;
+ }
+
+ void Helpers::upsert( const string& ns , const BSONObj& o ) {
+ BSONElement e = o["_id"];
+ assert( e.type() );
+ BSONObj id = e.wrap();
+
+ OpDebug debug;
+ Client::Context context(ns);
+ updateObjects(ns.c_str(), o, /*pattern=*/id, /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug );
+ }
+
+ void Helpers::putSingleton(const char *ns, BSONObj obj) {
+ OpDebug debug;
+ Client::Context context(ns);
+ updateObjects(ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug );
+ context.getClient()->curop()->done();
+ }
+
+ void Helpers::putSingletonGod(const char *ns, BSONObj obj, bool logTheOp) {
+ OpDebug debug;
+ Client::Context context(ns);
+ _updateObjects(/*god=*/true, ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , logTheOp , debug );
+ context.getClient()->curop()->done();
+ }
+
+ BSONObj Helpers::toKeyFormat( const BSONObj& o , BSONObj& key ) {
+ BSONObjBuilder me;
+ BSONObjBuilder k;
+
+ BSONObjIterator i( o );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ k.append( e.fieldName() , 1 );
+ me.appendAs( e , "" );
+ }
+ key = k.obj();
+ return me.obj();
+ }
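+
+ /* Illustrative: toKeyFormat( BSON( "a" << 2 << "b" << "x" ), key ) sets key to
+ { a: 1, b: 1 } and returns { "": 2, "": "x" } -- the key-pattern / key-value
+ pair that removeRange() below feeds to BtreeCursor. */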
+
+ long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback ) {
+ BSONObj keya , keyb;
+ BSONObj minClean = toKeyFormat( min , keya );
+ BSONObj maxClean = toKeyFormat( max , keyb );
+ assert( keya == keyb );
+
+ Client::Context ctx(ns);
+ NamespaceDetails* nsd = nsdetails( ns.c_str() );
+ if ( ! nsd )
+ return 0;
+
+ int ii = nsd->findIndexByKeyPattern( keya );
+ assert( ii >= 0 );
+
+ long long num = 0;
+
+ IndexDetails& i = nsd->idx( ii );
+
+ shared_ptr<Cursor> c( BtreeCursor::make( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) );
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
+ cc->setDoingDeletes( true );
+
+ while ( c->ok() ) {
+
+ if ( yield && ! cc->yieldSometimes( ClientCursor::WillNeed) ) {
+ // cursor got finished by someone else, so we're done
+ cc.release(); // if the collection/db is dropped, cc may be deleted
+ break;
+ }
+
+ if ( ! c->ok() )
+ break;
+
+ DiskLoc rloc = c->currLoc();
+
+ if ( callback )
+ callback->goingToDelete( c->current() );
+
+ c->advance();
+ c->noteLocation();
+
+ logOp( "d" , ns.c_str() , rloc.obj()["_id"].wrap() );
+ theDataFileMgr.deleteRecord(ns.c_str() , rloc.rec(), rloc);
+ num++;
+
+ c->checkLocation();
+
+ getDur().commitIfNeeded();
+
+
+ }
+
+ return num;
+ }
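+
+ /* Illustrative call (names assumed): with an { a: 1 } index on test.foo,
+ Helpers::removeRange( "test.foo", BSON( "a" << 1 ), BSON( "a" << 10 ) )
+ deletes every document whose key falls in [ {a:1}, {a:10} ) -- max inclusive
+ only if requested -- logging each delete to the oplog and returning the count. */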
+
+ void Helpers::emptyCollection(const char *ns) {
+ Client::Context context(ns);
+ deleteObjects(ns, BSONObj(), false);
+ }
+
+ DbSet::~DbSet() {
+ if ( name_.empty() )
+ return;
+ try {
+ Client::Context c( name_.c_str() );
+ if ( nsdetails( name_.c_str() ) ) {
+ string errmsg;
+ BSONObjBuilder result;
+ dropCollection( name_, errmsg, result );
+ }
+ }
+ catch ( ... ) {
+ problem() << "exception cleaning up DbSet" << endl;
+ }
+ }
+
+ void DbSet::reset( const string &name, const BSONObj &key ) {
+ if ( !name.empty() )
+ name_ = name;
+ if ( !key.isEmpty() )
+ key_ = key.getOwned();
+ Client::Context c( name_.c_str() );
+ if ( nsdetails( name_.c_str() ) ) {
+ Helpers::emptyCollection( name_.c_str() );
+ }
+ else {
+ string err;
+ massert( 10303 , err, userCreateNS( name_.c_str(), fromjson( "{autoIndexId:false}" ), err, false ) );
+ }
+ Helpers::ensureIndex( name_.c_str(), key_, true, "setIdx" );
+ }
+
+ bool DbSet::get( const BSONObj &obj ) const {
+ Client::Context c( name_.c_str() );
+ BSONObj temp;
+ return Helpers::findOne( name_.c_str(), obj, temp, true );
+ }
+
+ void DbSet::set( const BSONObj &obj, bool val ) {
+ Client::Context c( name_.c_str() );
+ if ( val ) {
+ try {
+ BSONObj k = obj;
+ theDataFileMgr.insertWithObjMod( name_.c_str(), k, false );
+ }
+ catch ( DBException& ) {
+ // dup key - already in set
+ }
+ }
+ else {
+ deleteObjects( name_.c_str(), obj, true, false, false );
+ }
+ }
+
+ RemoveSaver::RemoveSaver( const string& a , const string& b , const string& why) : _out(0) {
+ static int NUM = 0;
+
+ _root = dbpath;
+ if ( a.size() )
+ _root /= a;
+ if ( b.size() )
+ _root /= b;
+ assert( a.size() || b.size() );
+
+ _file = _root;
+
+ stringstream ss;
+ ss << why << "." << terseCurrentTime(false) << "." << NUM++ << ".bson";
+ _file /= ss.str();
+
+ }
+
+ RemoveSaver::~RemoveSaver() {
+ if ( _out ) {
+ _out->close();
+ delete _out;
+ _out = 0;
+ }
+ }
+
+ void RemoveSaver::goingToDelete( const BSONObj& o ) {
+ if ( ! _out ) {
+ create_directories( _root );
+ _out = new ofstream();
+ _out->open( _file.string().c_str() , ios_base::out | ios_base::binary );
+ if ( ! _out->good() ) {
+ log( LL_WARNING ) << "couldn't create file: " << _file.string() << " for remove saving" << endl;
+ delete _out;
+ _out = 0;
+ return;
+ }
+
+ }
+ _out->write( o.objdata() , o.objsize() );
+ }
+
+
+} // namespace mongo
diff --git a/src/mongo/db/dbhelpers.h b/src/mongo/db/dbhelpers.h
new file mode 100644
index 00000000000..99d401fa1f8
--- /dev/null
+++ b/src/mongo/db/dbhelpers.h
@@ -0,0 +1,159 @@
+/* @file dbhelpers.h
+
+ db helpers are helper functions and classes that let us easily manipulate the local
+ database instance in-proc.
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "client.h"
+#include "db.h"
+
+namespace mongo {
+
+ const BSONObj reverseNaturalObj = BSON( "$natural" << -1 );
+
+ class Cursor;
+ class CoveredIndexMatcher;
+
+ /**
+ all helpers assume locking is handled above them
+ */
+ struct Helpers {
+
+ /* ensure the specified index exists.
+
+ @param keyPattern key pattern, e.g., { ts : 1 }
+ @param name index name, e.g., "name_1"
+
+ This method can be a little (not much) cpu-slow, so you may wish to use
+ OCCASIONALLY ensureIndex(...);
+
+ Note: use ensureHaveIdIndex() for the _id index: it is faster.
+ Note: does nothing if collection does not yet exist.
+ */
+ static void ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name);
+
+ /* fetch a single object from collection ns that matches query.
+ set your db SavedContext first.
+
+ @param query - the query to perform. note this is the low level portion of query so "orderby : ..."
+ won't work.
+
+ @param requireIndex if true, assert if no index for the query. a way to guard against
+ writing a slow query.
+
+ @return true if object found
+ */
+ static bool findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex = false);
+ static DiskLoc findOne(const char *ns, const BSONObj &query, bool requireIndex);
+
+ /**
+         * @param nsFound if passed in, set to true if the ns was found
+         * @param indexFound if passed in, set to true if the _id index was found
+ * @return true if object found
+ */
+ static bool findById(Client&, const char *ns, BSONObj query, BSONObj& result ,
+ bool * nsFound = 0 , bool * indexFound = 0 );
+
+ /* uasserts if no _id index.
+ @return null loc if not found */
+ static DiskLoc findById(NamespaceDetails *d, BSONObj query);
+
+ /** Get/put the first (or last) object from a collection. Generally only useful if the collection
+ only ever has a single object -- which is a "singleton collection".
+
+ You do not need to set the database (Context) before calling.
+
+ @return true if object exists.
+ */
+ static bool getSingleton(const char *ns, BSONObj& result);
+ static void putSingleton(const char *ns, BSONObj obj);
+ static void putSingletonGod(const char *ns, BSONObj obj, bool logTheOp);
+ static bool getFirst(const char *ns, BSONObj& result) { return getSingleton(ns, result); }
+        static bool getLast(const char *ns, BSONObj& result); // get last object in the collection; e.g. {$natural : -1}
+
+ /**
+ * you have to lock
+ * you do not have to have Context set
+ * o has to have an _id field or will assert
+ */
+ static void upsert( const string& ns , const BSONObj& o );
+
+ /** You do not need to set the database before calling.
+ @return true if collection is empty.
+ */
+ static bool isEmpty(const char *ns, bool doAuth=true);
+
+ // TODO: this should be somewhere else probably
+ static BSONObj toKeyFormat( const BSONObj& o , BSONObj& key );
+
+ class RemoveCallback {
+ public:
+ virtual ~RemoveCallback() {}
+ virtual void goingToDelete( const BSONObj& o ) = 0;
+ };
+ /* removeRange: operation is oplog'd */
+ static long long removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield = false , bool maxInclusive = false , RemoveCallback * callback = 0 );
+
+ /* Remove all objects from a collection.
+ You do not need to set the database before calling.
+ */
+ static void emptyCollection(const char *ns);
+
+ };
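+
+    /* Illustrative use of the helpers above -- a hypothetical caller sketch, not
+       code from this patch. Assumes the caller already holds the appropriate lock
+       and that the namespaces shown exist:
+
+           Client::Context ctx( "test.foo" );
+           BSONObj result;
+           if ( Helpers::findOne( "test.foo", BSON( "x" << 1 ), result, false ) ) {
+               // use result
+           }
+           Helpers::putSingleton( "local.mysettings", BSON( "_id" << 1 << "v" << 2 ) );
+    */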
+
+ class Database;
+
+ // manage a set using collection backed storage
+ class DbSet {
+ public:
+ DbSet( const string &name = "", const BSONObj &key = BSONObj() ) :
+ name_( name ),
+ key_( key.getOwned() ) {
+ }
+ ~DbSet();
+ void reset( const string &name = "", const BSONObj &key = BSONObj() );
+ bool get( const BSONObj &obj ) const;
+ void set( const BSONObj &obj, bool val );
+ private:
+ string name_;
+ BSONObj key_;
+ };
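+
+    /* Illustrative DbSet usage (hypothetical caller; as with the helpers,
+       locking is assumed to be handled above):
+
+           DbSet seen;
+           seen.reset( "local.temp.seen", BSON( "h" << 1 ) );   // (re)create backing collection
+           seen.set( BSON( "h" << 42 ), true );                 // add to set
+           bool present = seen.get( BSON( "h" << 42 ) );        // membership test
+    */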
+
+
+ /**
+     * used for saving deleted bson objects to a flat file
+ */
+ class RemoveSaver : public Helpers::RemoveCallback , boost::noncopyable {
+ public:
+ RemoveSaver( const string& type , const string& ns , const string& why);
+ ~RemoveSaver();
+
+ void goingToDelete( const BSONObj& o );
+
+ private:
+ path _root;
+ path _file;
+ ofstream* _out;
+
+ };
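+
+    /* Illustrative pairing of RemoveSaver with Helpers::removeRange (hypothetical
+       caller; the directory and "why" strings are made up for the example):
+
+           RemoveSaver saver( "moveChunk", ns, "post-migration" );
+           long long n = Helpers::removeRange( ns, min, max, true, false, &saver );
+    */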
+
+
+} // namespace mongo
diff --git a/src/mongo/db/dbmessage.cpp b/src/mongo/db/dbmessage.cpp
new file mode 100644
index 00000000000..c86b5a05240
--- /dev/null
+++ b/src/mongo/db/dbmessage.cpp
@@ -0,0 +1,108 @@
+// dbmessage.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dbmessage.h"
+#include "../client/dbclient.h"
+
+namespace mongo {
+
+ string Message::toString() const {
+ stringstream ss;
+ ss << "op: " << opToString( operation() ) << " len: " << size();
+ if ( operation() >= 2000 && operation() < 2100 ) {
+ DbMessage d(*this);
+ ss << " ns: " << d.getns();
+ switch ( operation() ) {
+ case dbUpdate: {
+ int flags = d.pullInt();
+ BSONObj q = d.nextJsObj();
+ BSONObj o = d.nextJsObj();
+ ss << " flags: " << flags << " query: " << q << " update: " << o;
+ break;
+ }
+ case dbInsert:
+ ss << d.nextJsObj();
+ break;
+ case dbDelete: {
+ int flags = d.pullInt();
+ BSONObj q = d.nextJsObj();
+ ss << " flags: " << flags << " query: " << q;
+ break;
+ }
+ default:
+ ss << " CANNOT HANDLE YET";
+ }
+
+ }
+ return ss.str();
+ }
+
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ void *data, int size,
+ int nReturned, int startingFrom,
+ long long cursorId
+ ) {
+ BufBuilder b(32768);
+ b.skip(sizeof(QueryResult));
+ b.appendBuf(data, size);
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->_resultFlags() = queryResultFlags;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = cursorId;
+ qr->startingFrom = startingFrom;
+ qr->nReturned = nReturned;
+ b.decouple();
+ Message resp(qr, true);
+ p->reply(requestMsg, resp, requestMsg.header()->id);
+ }
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ BSONObj& responseObj) {
+ replyToQuery(queryResultFlags,
+ p, requestMsg,
+ (void *) responseObj.objdata(), responseObj.objsize(), 1);
+ }
+
+ void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) {
+ BufBuilder b;
+ b.skip(sizeof(QueryResult));
+ b.appendBuf((void*) obj.objdata(), obj.objsize());
+ QueryResult* msgdata = (QueryResult *) b.buf();
+ b.decouple();
+ QueryResult *qr = msgdata;
+ qr->_resultFlags() = queryResultFlags;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ Message *resp = new Message();
+ resp->setData(msgdata, true); // transport will free
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.header()->id;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/dbmessage.h b/src/mongo/db/dbmessage.h
new file mode 100644
index 00000000000..a789bff849c
--- /dev/null
+++ b/src/mongo/db/dbmessage.h
@@ -0,0 +1,282 @@
+// dbmessage.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "diskloc.h"
+#include "jsobj.h"
+#include "namespace-inl.h"
+#include "../util/net/message.h"
+#include "../client/constants.h"
+#include "instance.h"
+
+namespace mongo {
+
+ /* db response format
+
+ Query or GetMore: // see struct QueryResult
+ int resultFlags;
+ int64 cursorID;
+ int startingFrom;
+ int nReturned;
+ list of marshalled JSObjects;
+ */
+
+/* db request message format
+
+   unsigned opid;         // arbitrary; will be echoed back
+ byte operation;
+ int options;
+
+ then for:
+
+ dbInsert:
+ string collection;
+ a series of JSObjects
+ dbDelete:
+ string collection;
+ int flags=0; // 1=DeleteSingle
+ JSObject query;
+ dbUpdate:
+ string collection;
+ int flags; // 1=upsert
+ JSObject query;
+ JSObject objectToUpdate;
+ objectToUpdate may include { $inc: <field> } or { $set: ... }, see struct Mod.
+ dbQuery:
+ string collection;
+ int nToSkip;
+ int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit)
+ // greater than zero is simply a hint on how many objects to send back per "cursor batch".
+ // a negative number indicates a hard limit.
+ JSObject query;
+ [JSObject fieldsToReturn]
+ dbGetMore:
+ string collection; // redundant, might use for security.
+ int nToReturn;
+ int64 cursorID;
+ dbKillCursors=2007:
+ int n;
+ int64 cursorIDs[n];
+
+ Note that on Update, there is only one object, which is different
+ from insert where you can pass a list of objects to insert in the db.
+   Note that the update field layout is very similar to that of Query.
+*/
+
+
+#pragma pack(1)
+ struct QueryResult : public MsgData {
+ long long cursorId;
+ int startingFrom;
+ int nReturned;
+ const char *data() {
+ return (char *) (((int *)&nReturned)+1);
+ }
+ int resultFlags() {
+ return dataAsInt();
+ }
+ int& _resultFlags() {
+ return dataAsInt();
+ }
+ void setResultFlagsToOk() {
+ _resultFlags() = ResultFlag_AwaitCapable;
+ }
+ void initializeResultFlags() {
+ _resultFlags() = 0;
+ }
+ };
+
+#pragma pack()
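+
+    /* Illustrative walk of a received reply (hypothetical client-side sketch;
+       assumes reply is a Message holding an opReply):
+
+           QueryResult *qr = (QueryResult *) reply.singleData();
+           const char *p = qr->data();
+           for ( int i = 0; i < qr->nReturned; i++ ) {
+               BSONObj o( p );
+               p += o.objsize();
+           }
+    */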
+
+ /* For the database/server protocol, these objects and functions encapsulate
+ the various messages transmitted over the connection.
+
+ See http://www.mongodb.org/display/DOCS/Mongo+Wire+Protocol
+ */
+ class DbMessage {
+ public:
+ DbMessage(const Message& _m) : m(_m) , mark(0) {
+ // for received messages, Message has only one buffer
+ theEnd = _m.singleData()->_data + _m.header()->dataLen();
+ char *r = _m.singleData()->_data;
+ reserved = (int *) r;
+ data = r + 4;
+ nextjsobj = data;
+ }
+
+ /** the 32 bit field before the ns
+         * track all bit usage here as it's cross-op
+ * 0: InsertOption_ContinueOnError
+ * 1: fromWriteback
+ */
+ int& reservedField() { return *reserved; }
+
+ const char * getns() const {
+ return data;
+ }
+ void getns(Namespace& ns) const {
+ ns = data;
+ }
+
+ const char * afterNS() const {
+ return data + strlen( data ) + 1;
+ }
+
+ int getInt( int num ) const {
+ const int * foo = (const int*)afterNS();
+ return foo[num];
+ }
+
+ int getQueryNToReturn() const {
+ return getInt( 1 );
+ }
+
+ /**
+ * get an int64 at specified offsetBytes after ns
+ */
+ long long getInt64( int offsetBytes ) const {
+ const char * x = afterNS();
+ x += offsetBytes;
+ const long long * ll = (const long long*)x;
+ return ll[0];
+ }
+
+ void resetPull() { nextjsobj = data; }
+        int pullInt() const { return const_cast<DbMessage*>(this)->pullInt(); }
+ int& pullInt() {
+ if ( nextjsobj == data )
+ nextjsobj += strlen(data) + 1; // skip namespace
+ int& i = *((int *)nextjsobj);
+ nextjsobj += 4;
+ return i;
+ }
+        long long pullInt64() const {
+            return const_cast<DbMessage*>(this)->pullInt64();
+        }
+ long long &pullInt64() {
+ if ( nextjsobj == data )
+ nextjsobj += strlen(data) + 1; // skip namespace
+ long long &i = *((long long *)nextjsobj);
+ nextjsobj += 8;
+ return i;
+ }
+
+ OID* getOID() const {
+ return (OID *) (data + strlen(data) + 1); // skip namespace
+ }
+
+ void getQueryStuff(const char *&query, int& ntoreturn) {
+ int *i = (int *) (data + strlen(data) + 1);
+ ntoreturn = *i;
+ i++;
+ query = (const char *) i;
+ }
+
+ /* for insert and update msgs */
+ bool moreJSObjs() const {
+ return nextjsobj != 0;
+ }
+ BSONObj nextJsObj() {
+ if ( nextjsobj == data ) {
+ nextjsobj += strlen(data) + 1; // skip namespace
+ massert( 13066 , "Message contains no documents", theEnd > nextjsobj );
+ }
+ massert( 10304 , "Client Error: Remaining data too small for BSON object", theEnd - nextjsobj > 3 );
+ BSONObj js(nextjsobj);
+ massert( 10305 , "Client Error: Invalid object size", js.objsize() > 3 );
+ massert( 10306 , "Client Error: Next object larger than space left in message",
+ js.objsize() < ( theEnd - data ) );
+ if ( cmdLine.objcheck && !js.valid() ) {
+ massert( 10307 , "Client Error: bad object in message", false);
+ }
+ nextjsobj += js.objsize();
+ if ( nextjsobj >= theEnd )
+ nextjsobj = 0;
+ return js;
+ }
+
+ const Message& msg() const { return m; }
+
+ void markSet() {
+ mark = nextjsobj;
+ }
+
+ void markReset() {
+ assert( mark );
+ nextjsobj = mark;
+ }
+
+ private:
+ const Message& m;
+ int* reserved;
+ const char *data;
+ const char *nextjsobj;
+ const char *theEnd;
+
+ const char * mark;
+
+ public:
+ enum ReservedOptions {
+ Reserved_InsertOption_ContinueOnError = 1 << 0 ,
+ Reserved_FromWriteback = 1 << 1
+ };
+ };
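+
+    /* Illustrative parse of a dbInsert message using DbMessage (hypothetical
+       receiving code; assumes m is a received Message):
+
+           DbMessage d( m );
+           const char *ns = d.getns();
+           while ( d.moreJSObjs() ) {
+               BSONObj doc = d.nextJsObj();   // one document per iteration
+               // insert doc into ns ...
+           }
+    */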
+
+
+ /* a request to run a query, received from the database */
+ class QueryMessage {
+ public:
+ const char *ns;
+ int ntoskip;
+ int ntoreturn;
+ int queryOptions;
+ BSONObj query;
+ BSONObj fields;
+
+ /* parses the message into the above fields */
+ QueryMessage(DbMessage& d) {
+ ns = d.getns();
+ ntoskip = d.pullInt();
+ ntoreturn = d.pullInt();
+ query = d.nextJsObj();
+ if ( d.moreJSObjs() ) {
+ fields = d.nextJsObj();
+ }
+ queryOptions = d.msg().header()->dataAsInt();
+ }
+ };
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ void *data, int size,
+ int nReturned, int startingFrom = 0,
+ long long cursorId = 0
+ );
+
+
+ /* object reply helper. */
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ BSONObj& responseObj);
+
+ /* helper to do a reply using a DbResponse object */
+ void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj);
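+
+    /* Illustrative reply from a request handler (hypothetical; dbresponse comes
+       from the caller that dispatched the message m):
+
+           BSONObjBuilder b;
+           b.append( "ok" , 1 );
+           replyToQuery( 0, m, dbresponse, b.obj() );
+    */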
+
+
+} // namespace mongo
diff --git a/src/mongo/db/dbwebserver.cpp b/src/mongo/db/dbwebserver.cpp
new file mode 100644
index 00000000000..eb19ba3be6c
--- /dev/null
+++ b/src/mongo/db/dbwebserver.cpp
@@ -0,0 +1,539 @@
+/* dbwebserver.cpp
+
+ This is the administrative web page displayed on port 28017.
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/net/miniwebserver.h"
+#include "../util/mongoutils/html.h"
+#include "../util/md5.hpp"
+#include "db.h"
+#include "instance.h"
+#include "security.h"
+#include "stats/snapshots.h"
+#include "background.h"
+#include "commands.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "pcrecpp.h"
+#include "../util/admin_access.h"
+#include "dbwebserver.h"
+#include <boost/date_time/posix_time/posix_time.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace mongo {
+
+ using namespace mongoutils::html;
+ using namespace bson;
+
+ time_t started = time(0);
+
+ struct Timing {
+ Timing() {
+ start = timeLocked = 0;
+ }
+ unsigned long long start, timeLocked;
+ };
+
+ bool execCommand( Command * c ,
+ Client& client , int queryOptions ,
+ const char *ns, BSONObj& cmdObj ,
+ BSONObjBuilder& result,
+ bool fromRepl );
+
+ class DbWebServer : public MiniWebServer {
+ public:
+ DbWebServer(const string& ip, int port, const AdminAccess* webUsers)
+ : MiniWebServer("admin web console", ip, port), _webUsers(webUsers) {
+ WebStatusPlugin::initAll();
+ }
+
+ private:
+ const AdminAccess* _webUsers; // not owned here
+
+ void doUnlockedStuff(stringstream& ss) {
+ /* this is in the header already ss << "port: " << port << '\n'; */
+ ss << "<pre>";
+ ss << mongodVersion() << '\n';
+ ss << "git hash: " << gitVersion() << '\n';
+ ss << "sys info: " << sysInfo() << '\n';
+ ss << "uptime: " << time(0)-started << " seconds\n";
+ ss << "</pre>";
+ }
+
+ bool allowed( const char * rq , vector<string>& headers, const SockAddr &from ) {
+ if ( from.isLocalHost() || !_webUsers->haveAdminUsers() ) {
+ cmdAuthenticate.authenticate( "admin", "RestUser", false );
+ return true;
+ }
+
+ string auth = getHeader( rq , "Authorization" );
+
+ if ( auth.size() > 0 && auth.find( "Digest " ) == 0 ) {
+ auth = auth.substr( 7 ) + ", ";
+
+ map<string,string> parms;
+ pcrecpp::StringPiece input( auth );
+
+ string name, val;
+ pcrecpp::RE re("(\\w+)=\"?(.*?)\"?, ");
+ while ( re.Consume( &input, &name, &val) ) {
+ parms[name] = val;
+ }
+
+ BSONObj user = _webUsers->getAdminUser( parms["username"] );
+ if ( ! user.isEmpty() ) {
+ string ha1 = user["pwd"].str();
+ string ha2 = md5simpledigest( (string)"GET" + ":" + parms["uri"] );
+
+ stringstream r;
+ r << ha1 << ':' << parms["nonce"];
+ if ( parms["nc"].size() && parms["cnonce"].size() && parms["qop"].size() ) {
+ r << ':';
+ r << parms["nc"];
+ r << ':';
+ r << parms["cnonce"];
+ r << ':';
+ r << parms["qop"];
+ }
+ r << ':';
+ r << ha2;
+ string r1 = md5simpledigest( r.str() );
+
+ if ( r1 == parms["response"] ) {
+ cmdAuthenticate.authenticate( "admin", user["user"].str(), user[ "readOnly" ].isBoolean() && user[ "readOnly" ].boolean() );
+ return true;
+ }
+ }
+ }
+
+ stringstream authHeader;
+ authHeader
+ << "WWW-Authenticate: "
+ << "Digest realm=\"mongo\", "
+ << "nonce=\"abc\", "
+ << "algorithm=MD5, qop=\"auth\" "
+ ;
+
+ headers.push_back( authHeader.str() );
+            return false;
+ }
+
+ virtual void doRequest(
+ const char *rq, // the full request
+ string url,
+ // set these and return them:
+ string& responseMsg,
+ int& responseCode,
+ vector<string>& headers, // if completely empty, content-type: text/html will be added
+ const SockAddr &from
+ ) {
+ if ( url.size() > 1 ) {
+
+ if ( ! allowed( rq , headers, from ) ) {
+ responseCode = 401;
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg = "not allowed\n";
+ return;
+ }
+
+ {
+ BSONObj params;
+ const size_t pos = url.find( "?" );
+ if ( pos != string::npos ) {
+ MiniWebServer::parseParams( params , url.substr( pos + 1 ) );
+ url = url.substr(0, pos);
+ }
+
+ DbWebHandler * handler = DbWebHandler::findHandler( url );
+ if ( handler ) {
+ if ( handler->requiresREST( url ) && ! cmdLine.rest ) {
+ _rejectREST( responseMsg , responseCode , headers );
+ }
+ else {
+ string callback = params.getStringField("jsonp");
+ uassert(13453, "server not started with --jsonp", callback.empty() || cmdLine.jsonp);
+
+ handler->handle( rq , url , params , responseMsg , responseCode , headers , from );
+
+ if (responseCode == 200 && !callback.empty()) {
+ responseMsg = callback + '(' + responseMsg + ')';
+ }
+ }
+ return;
+ }
+ }
+
+
+ if ( ! cmdLine.rest ) {
+ _rejectREST( responseMsg , responseCode , headers );
+ return;
+ }
+
+ responseCode = 404;
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
+ responseMsg = "<html><body>unknown url</body></html>\n";
+ return;
+ }
+
+ // generate home page
+
+ if ( ! allowed( rq , headers, from ) ) {
+ responseCode = 401;
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg = "not allowed\n";
+ return;
+ }
+
+ responseCode = 200;
+ stringstream ss;
+ string dbname;
+ {
+ stringstream z;
+ z << cmdLine.binaryName << ' ' << prettyHostName();
+ dbname = z.str();
+ }
+ ss << start(dbname) << h2(dbname);
+ ss << "<p><a href=\"/_commands\">List all commands</a> | \n";
+ ss << "<a href=\"/_replSet\">Replica set status</a></p>\n";
+
+ //ss << "<a href=\"/_status\">_status</a>";
+ {
+ const map<string, Command*> *m = Command::webCommands();
+ if( m ) {
+ ss <<
+ a("",
+ "These read-only context-less commands can be executed from the web interface. "
+ "Results are json format, unless ?text=1 is appended in which case the result is output as text "
+ "for easier human viewing",
+ "Commands")
+ << ": ";
+ for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ ) {
+ stringstream h;
+ i->second->help(h);
+ string help = h.str();
+ ss << "<a href=\"/" << i->first << "?text=1\"";
+ if( help != "no help defined" )
+ ss << " title=\"" << help << '"';
+ ss << ">" << i->first << "</a> ";
+ }
+ ss << '\n';
+ }
+ }
+ ss << '\n';
+ /*
+ ss << "HTTP <a "
+ "title=\"click for documentation on this http interface\""
+ "href=\"http://www.mongodb.org/display/DOCS/Http+Interface\">admin port</a>:" << _port << "<p>\n";
+ */
+
+ doUnlockedStuff(ss);
+
+ WebStatusPlugin::runAll( ss );
+
+ ss << "</body></html>\n";
+ responseMsg = ss.str();
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
+ }
+
+ void _rejectREST( string& responseMsg , int& responseCode, vector<string>& headers ) {
+ responseCode = 403;
+ stringstream ss;
+ ss << "REST is not enabled. use --rest to turn on.\n";
+ ss << "check that port " << _port << " is secured for the network too.\n";
+ responseMsg = ss.str();
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ }
+
+ };
+ // ---
+
+ bool prisort( const Prioritizable * a , const Prioritizable * b ) {
+ return a->priority() < b->priority();
+ }
+
+ // -- status framework ---
+    WebStatusPlugin::WebStatusPlugin( const string& sectionName , double priority , const string& subheader )
+        : Prioritizable(priority), _name( sectionName ) , _subHeading( subheader ) {
+ if ( ! _plugins )
+ _plugins = new vector<WebStatusPlugin*>();
+ _plugins->push_back( this );
+ }
+
+ void WebStatusPlugin::initAll() {
+ if ( ! _plugins )
+ return;
+
+ sort( _plugins->begin(), _plugins->end() , prisort );
+
+ for ( unsigned i=0; i<_plugins->size(); i++ )
+ (*_plugins)[i]->init();
+ }
+
+ void WebStatusPlugin::runAll( stringstream& ss ) {
+ if ( ! _plugins )
+ return;
+
+ for ( unsigned i=0; i<_plugins->size(); i++ ) {
+ WebStatusPlugin * p = (*_plugins)[i];
+ ss << "<hr>\n"
+ << "<b>" << p->_name << "</b>";
+
+ ss << " " << p->_subHeading;
+
+ ss << "<br>\n";
+
+ p->run(ss);
+ }
+
+ }
+
+ vector<WebStatusPlugin*> * WebStatusPlugin::_plugins = 0;
+
+    // -- basic status plugins --
+
+ class LogPlugin : public WebStatusPlugin {
+ public:
+ LogPlugin() : WebStatusPlugin( "Log" , 100 ), _log(0) {
+ }
+
+ virtual void init() {
+ _log = RamLog::get( "global" );
+ if ( ! _log ) {
+ _log = new RamLog("global");
+ Logstream::get().addGlobalTee( _log );
+ }
+ }
+
+ virtual void run( stringstream& ss ) {
+ _log->toHTML( ss );
+ }
+ RamLog * _log;
+ };
+
+ LogPlugin * logPlugin = new LogPlugin();
+
+ // -- handler framework ---
+
+ DbWebHandler::DbWebHandler( const string& name , double priority , bool requiresREST )
+ : Prioritizable(priority), _name(name) , _requiresREST(requiresREST) {
+
+ {
+ // setup strings
+ _defaultUrl = "/";
+ _defaultUrl += name;
+
+ stringstream ss;
+ ss << name << " priority: " << priority << " rest: " << requiresREST;
+ _toString = ss.str();
+ }
+
+ {
+ // add to handler list
+ if ( ! _handlers )
+ _handlers = new vector<DbWebHandler*>();
+ _handlers->push_back( this );
+ sort( _handlers->begin() , _handlers->end() , prisort );
+ }
+ }
+
+ DbWebHandler * DbWebHandler::findHandler( const string& url ) {
+ if ( ! _handlers )
+ return 0;
+
+ for ( unsigned i=0; i<_handlers->size(); i++ ) {
+ DbWebHandler * h = (*_handlers)[i];
+ if ( h->handles( url ) )
+ return h;
+ }
+
+ return 0;
+ }
+
+ vector<DbWebHandler*> * DbWebHandler::_handlers = 0;
+
+ // --- basic handlers ---
+
+ class FavIconHandler : public DbWebHandler {
+ public:
+ FavIconHandler() : DbWebHandler( "favicon.ico" , 0 , false ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ responseCode = 404;
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg = "no favicon\n";
+ }
+
+ } faviconHandler;
+
+ class StatusHandler : public DbWebHandler {
+ public:
+ StatusHandler() : DbWebHandler( "_status" , 1 , false ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ headers.push_back( "Content-Type: application/json;charset=utf-8" );
+ responseCode = 200;
+
+ static vector<string> commands;
+ if ( commands.size() == 0 ) {
+ commands.push_back( "serverStatus" );
+ commands.push_back( "buildinfo" );
+ }
+
+ BSONObjBuilder buf(1024);
+
+ for ( unsigned i=0; i<commands.size(); i++ ) {
+ string cmd = commands[i];
+
+ Command * c = Command::findCommand( cmd );
+ assert( c );
+ assert( c->locktype() == 0 );
+
+ BSONObj co;
+ {
+ BSONObjBuilder b;
+ b.append( cmd , 1 );
+
+ if ( cmd == "serverStatus" && params["repl"].type() ) {
+ b.append( "repl" , atoi( params["repl"].valuestr() ) );
+ }
+
+ co = b.obj();
+ }
+
+ string errmsg;
+
+ BSONObjBuilder sub;
+ if ( ! c->run( "admin.$cmd" , co , 0, errmsg , sub , false ) )
+ buf.append( cmd , errmsg );
+ else
+ buf.append( cmd , sub.obj() );
+ }
+
+ responseMsg = buf.obj().jsonString();
+
+ }
+
+ } statusHandler;
+
+ class CommandListHandler : public DbWebHandler {
+ public:
+ CommandListHandler() : DbWebHandler( "_commands" , 1 , true ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
+ responseCode = 200;
+
+ stringstream ss;
+ ss << start("Commands List");
+ ss << p( a("/", "back", "Home") );
+ ss << p( "<b>MongoDB List of <a href=\"http://www.mongodb.org/display/DOCS/Commands\">Commands</a></b>\n" );
+ const map<string, Command*> *m = Command::commandsByBestName();
+ ss << "S:slave-ok R:read-lock W:write-lock A:admin-only<br>\n";
+ ss << table();
+ ss << "<tr><th>Command</th><th>Attributes</th><th>Help</th></tr>\n";
+ for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ )
+ i->second->htmlHelp(ss);
+ ss << _table() << _end();
+
+ responseMsg = ss.str();
+ }
+ } commandListHandler;
+
+ class CommandsHandler : public DbWebHandler {
+ public:
+ CommandsHandler() : DbWebHandler( "DUMMY COMMANDS" , 2 , true ) {}
+
+ bool _cmd( const string& url , string& cmd , bool& text, bo params ) const {
+ cmd = str::after(url, '/');
+ text = params["text"].boolean();
+ return true;
+ }
+
+ Command * _cmd( const string& cmd ) const {
+ const map<string,Command*> *m = Command::webCommands();
+ if( ! m )
+ return 0;
+
+ map<string,Command*>::const_iterator i = m->find(cmd);
+ if ( i == m->end() )
+ return 0;
+
+ return i->second;
+ }
+
+ virtual bool handles( const string& url ) const {
+ string cmd;
+ bool text;
+ if ( ! _cmd( url , cmd , text, bo() ) )
+ return false;
+ return _cmd(cmd) != 0;
+ }
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ string cmd;
+ bool text = false;
+ assert( _cmd( url , cmd , text, params ) );
+ Command * c = _cmd( cmd );
+ assert( c );
+
+ BSONObj cmdObj = BSON( cmd << 1 );
+ Client& client = cc();
+
+ BSONObjBuilder result;
+ execCommand(c, client, 0, "admin.", cmdObj , result, false);
+
+ responseCode = 200;
+
+ string j = result.done().jsonString(Strict, text );
+ responseMsg = j;
+
+ if( text ) {
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg += '\n';
+ }
+ else {
+ headers.push_back( "Content-Type: application/json;charset=utf-8" );
+ }
+
+ }
+
+ } commandsHandler;
+
+ // --- external ----
+
+ void webServerThread(const AdminAccess* adminAccess) {
+ boost::scoped_ptr<const AdminAccess> adminAccessPtr(adminAccess); // adminAccess is owned here
+ Client::initThread("websvr");
+ const int p = cmdLine.port + 1000;
+ DbWebServer mini(cmdLine.bind_ip, p, adminAccessPtr.get());
+ mini.initAndListen();
+ cc().shutdown();
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/dbwebserver.h b/src/mongo/db/dbwebserver.h
new file mode 100644
index 00000000000..bdbcba2c07d
--- /dev/null
+++ b/src/mongo/db/dbwebserver.h
@@ -0,0 +1,85 @@
+/** @file dbwebserver.h
+ */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "../util/admin_access.h"
+
+namespace mongo {
+
+ class Prioritizable {
+ public:
+ Prioritizable( double p ) : _priority(p) {}
+ double priority() const { return _priority; }
+ private:
+ double _priority;
+ };
+
+ class DbWebHandler : public Prioritizable {
+ public:
+ DbWebHandler( const string& name , double priority , bool requiresREST );
+ virtual ~DbWebHandler() {}
+
+ virtual bool handles( const string& url ) const { return url == _defaultUrl; }
+
+ virtual bool requiresREST( const string& url ) const { return _requiresREST; }
+
+ virtual void handle( const char *rq, // the full request
+ string url,
+ BSONObj params,
+ // set these and return them:
+ string& responseMsg,
+ int& responseCode,
+ vector<string>& headers, // if completely empty, content-type: text/html will be added
+ const SockAddr &from
+ ) = 0;
+
+ string toString() const { return _toString; }
+ static DbWebHandler * findHandler( const string& url );
+
+ private:
+ string _name;
+ bool _requiresREST;
+
+ string _defaultUrl;
+ string _toString;
+
+ static vector<DbWebHandler*> * _handlers;
+ };
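+
+    /* Illustrative handler subclass (hypothetical; registration happens in the
+       DbWebHandler constructor, so a static instance suffices):
+
+           class PingHandler : public DbWebHandler {
+           public:
+               PingHandler() : DbWebHandler( "_ping", 1, false ) {}
+               virtual void handle( const char *rq, string url, BSONObj params,
+                                    string& responseMsg, int& responseCode,
+                                    vector<string>& headers, const SockAddr &from ) {
+                   responseCode = 200;
+                   responseMsg = "pong\n";
+               }
+           };
+    */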
+
+ class WebStatusPlugin : public Prioritizable {
+ public:
+        WebStatusPlugin( const string& sectionName , double priority , const string& subheader = "" );
+ virtual ~WebStatusPlugin() {}
+
+ virtual void run( stringstream& ss ) = 0;
+        /** called when web server starts up */
+ virtual void init() = 0;
+
+ static void initAll();
+ static void runAll( stringstream& ss );
+ private:
+ string _name;
+ string _subHeading;
+ static vector<WebStatusPlugin*> * _plugins;
+
+ };
+
+ void webServerThread( const AdminAccess* admins );
+ string prettyHostName();
+
+} // namespace mongo
diff --git a/src/mongo/db/diskloc.h b/src/mongo/db/diskloc.h
new file mode 100644
index 00000000000..5295df3e260
--- /dev/null
+++ b/src/mongo/db/diskloc.h
@@ -0,0 +1,160 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* @file diskloc.h
+
+ Storage subsystem management.
+ Lays out our datafiles on disk, manages disk space.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ class Record;
+ class DeletedRecord;
+ class Extent;
+ class MongoDataFile;
+ class DiskLoc;
+
+ template< class Version > class BtreeBucket;
+
+#pragma pack(1)
+ /** represents a disk location/offset on disk in a database. 64 bits.
+ it is assumed these will be passed around by value a lot so don't do anything to make them large
+ (such as adding a virtual function)
+ */
+ class DiskLoc {
+        int _a;     // this will be volume, file #, etc. but is a logical value that could be anything depending on storage engine
+ int ofs;
+
+ public:
+
+ enum SentinelValues {
+ /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
+ NullOfs = -1,
+ MaxFiles=16000 // thus a limit of about 32TB of data per db
+ };
+
+ DiskLoc(int a, int Ofs) : _a(a), ofs(Ofs) { }
+ DiskLoc() { Null(); }
+ DiskLoc(const DiskLoc& l) {
+ _a=l._a;
+ ofs=l.ofs;
+ }
+
+ bool questionable() const {
+ return ofs < -1 ||
+ _a < -1 ||
+ _a > 524288;
+ }
+
+ bool isNull() const { return _a == -1; }
+ void Null() {
+ _a = -1;
+ ofs = 0; /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
+ }
+ void assertOk() { assert(!isNull()); }
+ void setInvalid() {
+ _a = -2;
+ ofs = 0;
+ }
+ bool isValid() const { return _a != -2; }
+
+ string toString() const {
+ if ( isNull() )
+ return "null";
+ stringstream ss;
+ ss << hex << _a << ':' << ofs;
+ return ss.str();
+ }
+
+ BSONObj toBSONObj() const { return BSON( "file" << _a << "offset" << ofs ); }
+
+ int a() const { return _a; }
+
+ int& GETOFS() { return ofs; }
+ int getOfs() const { return ofs; }
+ void set(int a, int b) {
+ _a=a;
+ ofs=b;
+ }
+
+ void inc(int amt) {
+ assert( !isNull() );
+ ofs += amt;
+ }
+
+        bool sameFile(DiskLoc b) {
+            return _a == b._a;
+        }
+
+        bool operator==(const DiskLoc& b) const {
+            return _a == b._a && ofs == b.ofs;
+        }
+ bool operator!=(const DiskLoc& b) const {
+ return !(*this==b);
+ }
+ const DiskLoc& operator=(const DiskLoc& b) {
+ _a=b._a;
+ ofs = b.ofs;
+ //assert(ofs!=0);
+ return *this;
+ }
+ int compare(const DiskLoc& b) const {
+ int x = _a - b._a;
+ if ( x )
+ return x;
+ return ofs - b.ofs;
+ }
+ bool operator<(const DiskLoc& b) const {
+ return compare(b) < 0;
+ }
+
+ /**
+ * Marks this disk loc for writing
+ * @returns a non const reference to this disk loc
+ * This function explicitly signals we are writing and casts away const
+ */
+ DiskLoc& writing() const; // see dur.h
+
+ /* Get the "thing" associated with this disk location.
+ it is assumed the object is what you say it is -- you must assure that
+ (think of this as an unchecked type cast)
+ Note: set your Context first so that the database to which the diskloc applies is known.
+ */
+ BSONObj obj() const;
+ Record* rec() const;
+ DeletedRecord* drec() const;
+ Extent* ext() const;
+
+ template< class V >
+ const BtreeBucket<V> * btree() const;
+
+ // Explicitly signals we are writing and casts away const
+ template< class V >
+ BtreeBucket<V> * btreemod() const;
+
+ /*MongoDataFile& pdf() const;*/
+ };
+#pragma pack()
+
+ const DiskLoc minDiskLoc(0, 1);
+ const DiskLoc maxDiskLoc(0x7fffffff, 0x7fffffff);
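+
+    /* Illustrative DiskLoc arithmetic (hypothetical values):
+
+           DiskLoc loc( 0, 0x1000 );       // file 0, offset 0x1000
+           loc.inc( 16 );                  // advance within the same file
+           assert( loc.compare( minDiskLoc ) > 0 && loc.compare( maxDiskLoc ) < 0 );
+    */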
+
+} // namespace mongo
diff --git a/src/mongo/db/driverHelpers.cpp b/src/mongo/db/driverHelpers.cpp
new file mode 100644
index 00000000000..12aa01886c4
--- /dev/null
+++ b/src/mongo/db/driverHelpers.cpp
@@ -0,0 +1,62 @@
+// driverHelpers.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ this file has dbcommands that are for drivers
+ mostly helpers
+*/
+
+
+#include "pch.h"
+#include "jsobj.h"
+#include "pdfile.h"
+#include "namespace-inl.h"
+#include "commands.h"
+#include "cmdline.h"
+#include "btree.h"
+#include "curop-inl.h"
+#include "../util/background.h"
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ class BasicDriverHelper : public Command {
+ public:
+ BasicDriverHelper( const char * name ) : Command( name ) {}
+
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool slaveOverrideOk() { return true; }
+ };
+
+ class ObjectIdTest : public BasicDriverHelper {
+ public:
+ ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ) {}
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( cmdObj.firstElement().type() != jstOID ) {
+ errmsg = "not oid";
+ return false;
+ }
+
+ const OID& oid = cmdObj.firstElement().__oid();
+ result.append( "oid" , oid );
+ result.append( "str" , oid.str() );
+
+ return true;
+ }
+ } driverObjectIdTest;
+}
diff --git a/src/mongo/db/dur.cpp b/src/mongo/db/dur.cpp
new file mode 100644
index 00000000000..822fa5232c0
--- /dev/null
+++ b/src/mongo/db/dur.cpp
@@ -0,0 +1,840 @@
+// @file dur.cpp durability in the storage engine (crash-safeness / journaling)
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ phases:
+
+ PREPLOGBUFFER
+ we will build an output buffer ourself and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ WRITETOJOURNAL
+ we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity
+ have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that).
+ for now (1.7.5/1.8.0) we are in read lock which is not ideal.
+ WRITETODATAFILES
+ apply the writes back to the non-private MMF after they are for certain in redo log
+ REMAPPRIVATEVIEW
+ we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real
+ remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want
+ to be too frequent.
+ there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will
+ be required. so doing these remaps fractionally is helpful.
+
+ mutexes:
+
+ READLOCK dbMutex
+ LOCK groupCommitMutex
+ PREPLOGBUFFER()
+ READLOCK mmmutex
+ commitJob.reset()
+ UNLOCK dbMutex // now other threads can write
+ WRITETOJOURNAL()
+ WRITETODATAFILES()
+ UNLOCK mmmutex
+ UNLOCK groupCommitMutex
+
+ on the next write lock acquisition for dbMutex: // see MongoMutex::_acquiredWriteLock()
+ REMAPPRIVATEVIEW()
+
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "client.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_commitjob.h"
+#include "dur_recover.h"
+#include "dur_stats.h"
+#include "../util/concurrency/race.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "../util/timer.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ namespace dur {
+
+ void PREPLOGBUFFER(JSectHeader& outParm);
+ void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed);
+ void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed);
+
+ /** declared later in this file
+ only used in this file -- use DurableInterface::commitNow() outside
+ */
+ static void groupCommit();
+
+ CommitJob& commitJob = *(new CommitJob()); // don't destroy
+
+ Stats stats;
+
+ void Stats::S::reset() {
+ memset(this, 0, sizeof(*this));
+ }
+
+ Stats::Stats() {
+ _a.reset();
+ _b.reset();
+ curr = &_a;
+ _intervalMicros = 3000000;
+ }
+
+ Stats::S * Stats::other() {
+ return curr == &_a ? &_b : &_a;
+ }
+ string _CSVHeader();
+
+ string Stats::S::_CSVHeader() {
+ return "cmts jrnMB\twrDFMB\tcIWLk\tearly\tprpLgB wrToJ\twrToDF\trmpPrVw";
+ }
+
+ string Stats::S::_asCSV() {
+ stringstream ss;
+ ss <<
+ setprecision(2) <<
+ _commits << '\t' << fixed <<
+ _journaledBytes / 1000000.0 << '\t' <<
+ _writeToDataFilesBytes / 1000000.0 << '\t' <<
+ _commitsInWriteLock << '\t' <<
+ _earlyCommits << '\t' <<
+ (unsigned) (_prepLogBufferMicros/1000) << '\t' <<
+ (unsigned) (_writeToJournalMicros/1000) << '\t' <<
+ (unsigned) (_writeToDataFilesMicros/1000) << '\t' <<
+ (unsigned) (_remapPrivateViewMicros/1000);
+ return ss.str();
+ }
+
+ //int getAgeOutJournalFiles();
+ BSONObj Stats::S::_asObj() {
+ BSONObjBuilder b;
+ b <<
+ "commits" << _commits <<
+ "journaledMB" << _journaledBytes / 1000000.0 <<
+ "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 <<
+ "compression" << _journaledBytes / (_uncompressedBytes+1.0) <<
+ "commitsInWriteLock" << _commitsInWriteLock <<
+ "earlyCommits" << _earlyCommits <<
+ "timeMs" <<
+ BSON( "dt" << _dtMillis <<
+ "prepLogBuffer" << (unsigned) (_prepLogBufferMicros/1000) <<
+ "writeToJournal" << (unsigned) (_writeToJournalMicros/1000) <<
+ "writeToDataFiles" << (unsigned) (_writeToDataFilesMicros/1000) <<
+ "remapPrivateView" << (unsigned) (_remapPrivateViewMicros/1000)
+ );
+ /*int r = getAgeOutJournalFiles();
+ if( r == -1 )
+ b << "ageOutJournalFiles" << "mutex timeout";
+ if( r == 0 )
+ b << "ageOutJournalFiles" << false;*/
+ if( cmdLine.journalCommitInterval != 0 )
+ b << "journalCommitIntervalMs" << cmdLine.journalCommitInterval;
+ return b.obj();
+ }
+
+ BSONObj Stats::asObj() {
+ return other()->_asObj();
+ }
+
+ void Stats::rotate() {
+ unsigned long long now = curTimeMicros64();
+ unsigned long long dt = now - _lastRotate;
+ if( dt >= _intervalMicros && _intervalMicros ) {
+ // rotate
+ curr->_dtMillis = (unsigned) (dt/1000);
+ _lastRotate = now;
+ curr = other();
+ curr->reset();
+ }
+ }
+
+ void NonDurableImpl::setNoJournal(void *dst, void *src, unsigned len) {
+ memcpy(dst, src, len);
+ }
+
+ void DurableImpl::setNoJournal(void *dst, void *src, unsigned len) {
+ // we are at least read locked, so we need not worry about REMAPPRIVATEVIEW herein.
+ DEV d.dbMutex.assertAtLeastReadLocked();
+
+ MemoryMappedFile::makeWritable(dst, len);
+
+ // we enter the RecoveryJob mutex here, so that if WRITETODATAFILES is happening we do not
+ // conflict with it
+ scoped_lock lk1( RecoveryJob::get()._mx );
+
+ // we stay in this mutex for everything to work with DurParanoid/validateSingleMapMatches
+ //
+ // either of these mutexes also makes setNoJournal threadsafe, which is good as we call it from a read
+ // (not a write) lock in class SlaveTracking
+ //
+ scoped_lock lk( privateViews._mutex() );
+
+ size_t ofs;
+ MongoMMF *f = privateViews.find_inlock(dst, ofs);
+ assert(f);
+ void *w = (((char *)f->view_write())+ofs);
+ // first write it to the writable (file) view
+ memcpy(w, src, len);
+ if( memcmp(w, dst, len) ) {
+ // if we get here, a copy-on-write had previously occurred. so write it to the private view too
+ // to keep them in sync. we do this as we do not want to cause a copy on write unnecessarily.
+ memcpy(dst, src, len);
+ }
+ }
+
+ /** base declare write intent function that all the helpers call. */
+ void DurableImpl::declareWriteIntent(void *p, unsigned len) {
+ commitJob.note(p, len);
+ }
+
+ static DurableImpl* durableImpl = new DurableImpl();
+ static NonDurableImpl* nonDurableImpl = new NonDurableImpl();
+ DurableInterface* DurableInterface::_impl = nonDurableImpl;
+
+ void DurableInterface::enableDurability() {
+ assert(_impl == nonDurableImpl);
+ _impl = durableImpl;
+ }
+
+ void DurableInterface::disableDurability() {
+ assert(_impl == durableImpl);
+ massert(13616, "can't disable durability with pending writes", !commitJob.hasWritten());
+ _impl = nonDurableImpl;
+ }
+
+ bool DurableImpl::commitNow() {
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ return true;
+ }
+
+ bool DurableImpl::awaitCommit() {
+ commitJob._notify.awaitBeyondNow();
+ return true;
+ }
+
+ /** Declare that a file has been created
+ Normally writes are applied only after journaling, for safety. But here the file
+ is created first, and the journal will just replay the creation if the create didn't
+ happen because of crashing.
+ */
+ void DurableImpl::createdFile(string filename, unsigned long long len) {
+ shared_ptr<DurOp> op( new FileCreatedOp(filename, len) );
+ commitJob.noteOp(op);
+ }
+
+ void* DurableImpl::writingPtr(void *x, unsigned len) {
+ void *p = x;
+ declareWriteIntent(p, len);
+ return p;
+ }
+
+ /** declare intent to write
+ @param ofs offset within buf at which we will write
+ @param len the length at ofs we will write
+ @return new buffer pointer.
+ */
+ void* DurableImpl::writingAtOffset(void *buf, unsigned ofs, unsigned len) {
+ char *p = (char *) buf;
+ declareWriteIntent(p+ofs, len);
+ return p;
+ }
+
+ void* DurableImpl::writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) {
+ char *p = (char *) buf;
+ for( vector< pair< long long, unsigned > >::const_iterator i = ranges.begin();
+ i != ranges.end(); ++i ) {
+ declareWriteIntent( p + i->first, i->second );
+ }
+ return p;
+ }
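+
+        /* Illustrative write-intent pattern (hypothetical caller; must hold the
+           write lock, as with all intent declarations):
+
+               char *p = (char *) getDur().writingPtr( rec, len );
+               memcpy( p, src, len );   // journaled: intent was declared first
+        */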
+
+ bool DurableImpl::aCommitIsNeeded() const {
+ DEV commitJob._nSinceCommitIfNeededCall = 0;
+ return commitJob.bytes() > UncommittedBytesLimit;
+ }
+
+ bool DurableImpl::commitIfNeeded() {
+ if ( !d.dbMutex.isWriteLocked() )
+ return false;
+
+ DEV commitJob._nSinceCommitIfNeededCall = 0;
+ if (commitJob.bytes() > UncommittedBytesLimit) { // should this also fire if CmdLine::DurAlwaysCommit?
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ return true;
+ }
+ return false;
+ }
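+
+        /* Illustrative use of commitIfNeeded in a long write loop (hypothetical
+           caller holding the write lock) -- this keeps the uncommitted byte count
+           bounded, as in Helpers::removeRange above:
+
+               while ( moreWorkToDo() ) {
+                   doOneWrite();
+                   getDur().commitIfNeeded();
+               }
+        */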
+
+ /** Used in _DEBUG builds to check that we didn't overwrite the last intent
+ that was declared. called just before writelock release. we check a few
+ bytes after the declared region to see if they changed.
+
+ @see MongoMutex::_releasedWriteLock
+
+ SLOW
+ */
+#if 0
+ void DurableImpl::debugCheckLastDeclaredWrite() {
+ static int n;
+ ++n;
+
+ assert(debug && cmdLine.dur);
+ if (commitJob.writes().empty())
+ return;
+ const WriteIntent &i = commitJob.lastWrite();
+ size_t ofs;
+ MongoMMF *mmf = privateViews.find(i.start(), ofs);
+ if( mmf == 0 )
+ return;
+ size_t past = ofs + i.length();
+ if( mmf->length() < past + 8 )
+ return; // too close to end of view
+ char *priv = (char *) mmf->getView();
+ char *writ = (char *) mmf->view_write();
+ unsigned long long *a = (unsigned long long *) (priv+past);
+ unsigned long long *b = (unsigned long long *) (writ+past);
+ if( *a != *b ) {
+ for( set<WriteIntent>::iterator it(commitJob.writes().begin()), end((commitJob.writes().begin())); it != end; ++it ) {
+ const WriteIntent& wi = *it;
+ char *r1 = (char*) wi.start();
+ char *r2 = (char*) wi.end();
+ if( r1 <= (((char*)a)+8) && r2 > (char*)a ) {
+ //log() << "it's ok " << wi.p << ' ' << wi.len << endl;
+ return;
+ }
+ }
+ log() << "journal data after write area " << i.start() << " does not agree" << endl;
+ log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl;
+ log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl;
+ log() << " n: " << n << endl;
+ log() << endl;
+ }
+ }
+#endif
+
+ // Functor to be called over all MongoFiles
+
+ class validateSingleMapMatches {
+ public:
+ validateSingleMapMatches(unsigned long long& bytes) :_bytes(bytes) {}
+ void operator () (MongoFile *mf) {
+ if( mf->isMongoMMF() ) {
+ MongoMMF *mmf = (MongoMMF*) mf;
+ const unsigned char *p = (const unsigned char *) mmf->getView();
+ const unsigned char *w = (const unsigned char *) mmf->view_write();
+
+ if (!p || !w) return; // File not fully opened yet
+
+ _bytes += mmf->length();
+
+ assert( mmf->length() == (unsigned) mmf->length() );
+ {
+ scoped_lock lk( privateViews._mutex() ); // see setNoJournal
+ if (memcmp(p, w, (unsigned) mmf->length()) == 0)
+ return; // next file
+ }
+
+ unsigned low = 0xffffffff;
+ unsigned high = 0;
+ log() << "DurParanoid mismatch in " << mmf->filename() << endl;
+ int logged = 0;
+ unsigned lastMismatch = 0xffffffff;
+ for( unsigned i = 0; i < mmf->length(); i++ ) {
+ if( p[i] != w[i] ) {
+ if( lastMismatch != 0xffffffff && lastMismatch+1 != i )
+ log() << endl; // separate blocks of mismatches
+ lastMismatch= i;
+ if( ++logged < 60 ) {
+ if( logged == 1 )
+ log() << "ofs % 628 = 0x" << hex << (i%628) << endl; // for .ns files to find offset in record
+ stringstream ss;
+ ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i];
+ if( p[i] > 32 && p[i] <= 126 )
+ ss << '\t' << p[i];
+ log() << ss.str() << endl;
+ }
+ if( logged == 60 )
+ log() << "..." << endl;
+ if( i < low ) low = i;
+ if( i > high ) high = i;
+ }
+ }
+ if( low != 0xffffffff ) {
+ std::stringstream ss;
+ ss << "journal error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
+ log() << ss.str() << endl;
+ log() << "priv loc: " << (void*)(p+low) << ' ' << endl;
+ set<WriteIntent>& b = commitJob.writes();
+ (void)b; // mark as unused. Useful for inspection in debugger
+
+ // should we abort() here so this isn't unnoticed in some circumstances?
+ massert(13599, "Written data does not match in-memory view. Missing WriteIntent?", false);
+ }
+ }
+ }
+ private:
+ unsigned long long& _bytes;
+ };
+
+ /** (SLOW) diagnostic to check that the private view and the non-private view are in sync.
+ */
+ void debugValidateAllMapsMatch() {
+ if( ! (cmdLine.durOptions & CmdLine::DurParanoid) )
+ return;
+
+ unsigned long long bytes = 0;
+ Timer t;
+ MongoFile::forEach(validateSingleMapMatches(bytes));
+ OCCASIONALLY log() << "DurParanoid map check " << t.millis() << "ms for " << (bytes / (1024*1024)) << "MB" << endl;
+ }
+
+ extern size_t privateMapBytes;
+
+ static void _REMAPPRIVATEVIEW() {
+ // todo: Consider using ProcessInfo herein and watching for getResidentSize to drop. that could be a way
+ // to assure very good behavior here.
+
+ static unsigned startAt;
+ static unsigned long long lastRemap;
+
+ LOG(4) << "journal REMAPPRIVATEVIEW" << endl;
+
+ d.dbMutex.assertWriteLocked();
+ d.dbMutex._remapPrivateViewRequested = false;
+ assert( !commitJob.hasWritten() );
+
+ // we want to remap all private views about every 2 seconds. there could be ~1000 views so
+ // we do a little each pass; beyond the remap time, more significantly, there will be copy on write
+ // faults after remapping, so doing a little bit at a time will avoid big load spikes on
+ // remapping.
+ unsigned long long now = curTimeMicros64();
+ double fraction = (now-lastRemap)/2000000.0;
+ if( cmdLine.durOptions & CmdLine::DurAlwaysRemap )
+ fraction = 1;
+ lastRemap = now;
+
+ LockMongoFilesShared lk;
+ set<MongoFile*>& files = MongoFile::getAllFiles();
+ unsigned sz = files.size();
+ if( sz == 0 )
+ return;
+
+ {
+ // be careful not to use too much memory if the write rate is
+ // extremely high
+ double f = privateMapBytes / ((double)UncommittedBytesLimit);
+ if( f > fraction ) {
+ fraction = f;
+ }
+ privateMapBytes = 0;
+ }
+
+ unsigned ntodo = (unsigned) (sz * fraction);
+ if( ntodo < 1 ) ntodo = 1;
+ if( ntodo > sz ) ntodo = sz;
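+
+            // worked example (illustrative numbers): with sz=1000 views and ~200ms
+            // between passes, fraction = 0.2/2.0 = 0.1, so ntodo = 100 files are
+            // remapped per pass -- the full set turns over roughly every 2 seconds.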
+
+ const set<MongoFile*>::iterator b = files.begin();
+ const set<MongoFile*>::iterator e = files.end();
+ set<MongoFile*>::iterator i = b;
+ // skip to our starting position
+ for( unsigned x = 0; x < startAt; x++ ) {
+ i++;
+ if( i == e ) i = b;
+ }
+ unsigned startedAt = startAt;
+ startAt = (startAt + ntodo) % sz; // mark where to start next time
+
+ Timer t;
+ for( unsigned x = 0; x < ntodo; x++ ) {
+ dassert( i != e );
+ if( (*i)->isMongoMMF() ) {
+ MongoMMF *mmf = (MongoMMF*) *i;
+ assert(mmf);
+ if( mmf->willNeedRemap() ) {
+ mmf->willNeedRemap() = false;
+ mmf->remapThePrivateView();
+ }
+ i++;
+ if( i == e ) i = b;
+ }
+ }
+ LOG(2) << "journal REMAPPRIVATEVIEW done startedAt: " << startedAt << " n:" << ntodo << ' ' << t.millis() << "ms" << endl;
+ }
+
+ /** We need to remap the private views periodically. otherwise they would become very large.
+ Call within write lock. See top of file for more commentary.
+ */
+ void REMAPPRIVATEVIEW() {
+ Timer t;
+ _REMAPPRIVATEVIEW();
+ stats.curr->_remapPrivateViewMicros += t.micros();
+ }
+
+ // lock order: dbMutex first, then this
+ mutex groupCommitMutex("groupCommit");
+
+ bool _groupCommitWithLimitedLocks() {
+
+ int p = 0;
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ scoped_ptr<ExcludeAllWrites> lk1( new ExcludeAllWrites() );
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ scoped_lock lk2(groupCommitMutex);
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ commitJob.beginCommit();
+
+ if( !commitJob.hasWritten() ) {
+                // getlasterror request could have come after the data was already committed
+ commitJob.notifyCommitted();
+ return true;
+ }
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ JSectHeader h;
+ PREPLOGBUFFER(h); // need to be in readlock (writes excluded) for this
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ LockMongoFilesShared lk3;
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ unsigned abLen = commitJob._ab.len();
+ commitJob.reset(); // must be reset before allowing anyone to write
+ DEV assert( !commitJob.hasWritten() );
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // release the readlock -- allowing others to now write while we are writing to the journal (etc.)
+ lk1.reset();
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // ****** now other threads can do writes ******
+
+ WRITETOJOURNAL(h, commitJob._ab);
+ assert( abLen == commitJob._ab.len() ); // a check that no one touched the builder while we were doing work. if so, our locking is wrong.
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that)
+ commitJob.notifyCommitted();
+
+ LOG(4) << "groupcommitll " << p++ << " WRITETODATAFILES()" << endl;
+
+ WRITETODATAFILES(h, commitJob._ab);
+ assert( abLen == commitJob._ab.len() ); // check again wasn't modded
+ commitJob._ab.reset();
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // can't : d.dbMutex._remapPrivateViewRequested = true;
+
+ return true;
+ }
+
+ /** @return true if committed; false if lock acquisition timed out (we only try for a read lock herein and only wait for a certain duration). */
+ bool groupCommitWithLimitedLocks() {
+ try {
+ return _groupCommitWithLimitedLocks();
+ }
+ catch(DBException& e ) {
+ log() << "dbexception in groupCommitLL causing immediate shutdown: " << e.toString() << endl;
+ mongoAbort("dur1");
+ }
+ catch(std::ios_base::failure& e) {
+ log() << "ios_base exception in groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur2");
+ }
+ catch(std::bad_alloc& e) {
+ log() << "bad_alloc exception in groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur3");
+ }
+ catch(std::exception& e) {
+ log() << "exception in dur::groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur4");
+ }
+ return false;
+ }
+
+ static void _groupCommit() {
+
+ LOG(4) << "_groupCommit " << endl;
+
+ // we need to be at least read locked on the dbMutex so that we know the write intent data
+ // structures are not changing while we work
+ d.dbMutex.assertAtLeastReadLocked();
+
+ commitJob.beginCommit();
+
+ if( !commitJob.hasWritten() ) {
+                // getlasterror request could have come after the data was already committed
+ commitJob.notifyCommitted();
+ return;
+ }
+
+ // we need to make sure two group commits aren't running at the same time
+ // (and we are only read locked in the dbMutex, so it could happen)
+ scoped_lock lk(groupCommitMutex);
+
+ JSectHeader h;
+ PREPLOGBUFFER(h);
+
+ // todo : write to the journal outside locks, as this write can be slow.
+ // however, be careful then about remapprivateview as that cannot be done
+ // if new writes are then pending in the private maps.
+ WRITETOJOURNAL(h, commitJob._ab);
+
+ // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that)
+ commitJob.notifyCommitted();
+
+ WRITETODATAFILES(h, commitJob._ab);
+ debugValidateAllMapsMatch();
+
+ commitJob.reset();
+ commitJob._ab.reset();
+
+ // REMAPPRIVATEVIEW
+ //
+ // remapping private views must occur after WRITETODATAFILES otherwise
+ // we wouldn't see newly written data on reads.
+ //
+ DEV assert( !commitJob.hasWritten() );
+ if( !d.dbMutex.isWriteLocked() ) {
+ // this needs to be done in a write lock (as there is a short window during remapping when each view
+ // might not exist), so we defer it to the next acquisition of the write lock instead of doing it here
+ // (there is no rush if you aren't writing anyway -- but when it is done, it must be done before any
+ // uncommitted writes occur). If desired, perhaps this can be eliminated on posix, as it may be that
+ // the remap is race-free there.
+ //
+ d.dbMutex._remapPrivateViewRequested = true;
+ }
+ else {
+ stats.curr->_commitsInWriteLock++;
+ // however, if we are already write locked, we must do it now -- up the call tree someone
+ // may do a write without a new lock acquisition. this can happen when MongoMMF::close() calls
+ // this method when a file (and its views) is about to go away.
+ //
+ REMAPPRIVATEVIEW();
+ }
+ }
+
+ /** locking: in read lock when called
+ or, for early commits (commitIfNeeded), in write lock
+ @see MongoMMF::close()
+ */
+ static void groupCommit() {
+ try {
+ _groupCommit();
+ }
+ catch(DBException& e ) {
+ log() << "dbexception in groupCommit causing immediate shutdown: " << e.toString() << endl;
+ mongoAbort("gc1");
+ }
+ catch(std::ios_base::failure& e) {
+ log() << "ios_base exception in groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc2");
+ }
+ catch(std::bad_alloc& e) {
+ log() << "bad_alloc exception in groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc3");
+ }
+ catch(std::exception& e) {
+ log() << "exception in dur::groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc4");
+ }
+ LOG(4) << "groupCommit end" << endl;
+ }
+
+ static void go() {
+ const int N = 10;
+ static int n;
+ if( privateMapBytes < UncommittedBytesLimit && ++n % N && (cmdLine.durOptions&CmdLine::DurAlwaysRemap)==0 ) {
+ // limited locks version doesn't do any remapprivateview at all, so only try this if privateMapBytes
+ // is in an acceptable range. also every Nth commit, we do everything so we can do some remapping;
+ // remapping a lot at once could cause jitter from a large burst of copy-on-writes.
+ if( groupCommitWithLimitedLocks() )
+ return;
+ }
+ else {
+ readlocktry lk("", 1000);
+ if( lk.got() ) {
+ groupCommit();
+ return;
+ }
+ }
+
+ // starvation on read locks could occur, so if read lock acquisition is slow, try to get a
+ // write lock instead. otherwise journaling could be delayed too long (too much data will
+ // not accumulate, though, as the commitIfNeeded logic will have executed in the meantime if there
+ // have been writes)
+ writelock lk;
+ groupCommit();
+ }
+
+ /** called when a MongoMMF is closing -- we need to go ahead and group commit in that case before its
+ views disappear
+ */
+ void closingFileNotification() {
+ if (!cmdLine.dur)
+ return;
+
+ if( d.dbMutex.atLeastReadLocked() ) {
+ groupCommit();
+ }
+ else {
+ assert( inShutdown() );
+ if( commitJob.hasWritten() ) {
+ log() << "journal warning files are closing outside locks with writes pending" << endl;
+ }
+ }
+ }
+
+ extern int groupCommitIntervalMs;
+ boost::filesystem::path getJournalDir();
+
+ void durThread() {
+ Client::initThread("journal");
+
+ bool samePartition = true;
+ try {
+ const string dbpathDir = boost::filesystem::path(dbpath).native_directory_string();
+ samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
+ }
+ catch(...) {
+ }
+
+ while( !inShutdown() ) {
+ RACECHECK
+
+ unsigned ms = cmdLine.journalCommitInterval;
+ if( ms == 0 ) {
+ // use default
+ ms = samePartition ? 100 : 30;
+ }
+
+ unsigned oneThird = (ms / 3) + 1; // +1 so never zero
+
+ try {
+ stats.rotate();
+
+ // we do this in a couple of blocks (the invoke()), which makes throughput a tiny bit faster (only a little),
+ // but is likely also less spiky on our cpu usage, which is good.
+
+ // commit sooner if one or more getLastError j:true requests are pending
+ sleepmillis(oneThird);
+ for( unsigned i = 1; i <= 2; i++ ) {
+ if( commitJob._notify.nWaiting() )
+ break;
+ commitJob.wi()._deferred.invoke();
+ sleepmillis(oneThird);
+ }
+
+ go();
+ }
+ catch(std::exception& e) {
+ log() << "exception in durThread causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("exception in durThread");
+ }
+ }
+ cc().shutdown();
+ }
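+
+ /* timing sketch (illustrative): with the default 100ms interval, oneThird is
+ 100/3 + 1 = 34ms. The loop above sleeps 34ms, drains deferred intents, and
+ repeats up to twice more, but jumps straight to go() as soon as a
+ getLastError j:true waiter appears in commitJob._notify -- so such waiters
+ wait roughly a third of the commit interval at most, not the whole thing. */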
+
+ void recover();
+
+ unsigned notesThisLock = 0;
+
+ void releasingWriteLock() {
+ DEV notesThisLock = 0;
+ // implicit commitIfNeeded check on each write unlock
+ DEV commitJob._nSinceCommitIfNeededCall = 0; // implicit commit if needed
+ if( commitJob.bytes() > UncommittedBytesLimit || cmdLine.durOptions & CmdLine::DurAlwaysCommit ) {
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ }
+ }
+
+ void preallocateFiles();
+
+ /** at startup, recover, and then start the journal threads */
+ void startup() {
+ if( !cmdLine.dur )
+ return;
+
+#if defined(_DURABLEDEFAULTON)
+ DEV {
+ if( time(0) & 1 ) {
+ cmdLine.durOptions |= CmdLine::DurAlwaysCommit;
+ log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysCommit mode for this run" << endl;
+ }
+ if( time(0) & 2 ) {
+ cmdLine.durOptions |= CmdLine::DurAlwaysRemap;
+ log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysRemap mode for this run" << endl;
+ }
+ }
+#endif
+
+ DurableInterface::enableDurability();
+
+ journalMakeDir();
+ try {
+ recover();
+ }
+ catch(...) {
+ log() << "exception during recovery" << endl;
+ throw;
+ }
+
+ preallocateFiles();
+
+ boost::thread t(durThread);
+ }
+
+ void DurableImpl::syncDataAndTruncateJournal() {
+ d.dbMutex.assertWriteLocked();
+
+ // a commit from the commit thread won't begin while we are in the write lock,
+ // but it may already be in progress and the end of that work is done outside
+ // (dbMutex) locks. The block below waits for that to complete if it is already underway.
+ {
+ scoped_lock lk(groupCommitMutex);
+ }
+
+ groupCommit();
+ MongoFile::flushAll(true);
+ journalCleanup();
+
+ assert(!haveJournalFiles()); // Double check post-conditions
+ }
+
+ } // namespace dur
+
+} // namespace mongo
diff --git a/src/mongo/db/dur.h b/src/mongo/db/dur.h
new file mode 100644
index 00000000000..f06ff500195
--- /dev/null
+++ b/src/mongo/db/dur.h
@@ -0,0 +1,209 @@
+// @file dur.h durability support
+
+#pragma once
+
+#include "diskloc.h"
+#include "mongommf.h"
+
+namespace mongo {
+
+ class NamespaceDetails;
+
+ void mongoAbort(const char *msg);
+ void abort(); // not defined -- use mongoAbort() instead
+
+ namespace dur {
+
+ // a smaller limit is likely better on 32 bit
+#if defined(__i386__) || defined(_M_IX86)
+ const unsigned UncommittedBytesLimit = 50 * 1024 * 1024;
+#else
+ const unsigned UncommittedBytesLimit = 100 * 1024 * 1024;
+#endif
+
+ /** Call during startup so the durability module can initialize
+ Throws if fatal error
+ Does nothing if cmdLine.dur is false
+ */
+ void startup();
+
+ class DurableInterface : boost::noncopyable {
+ public:
+ virtual ~DurableInterface() { log() << "ERROR warning ~DurableInterface not intended to be called" << endl; }
+
+ /** Declare that a file has been created
+ Normally writes are applied only after journaling, for safety. But here the file
+ is created first, and the journal will just replay the creation if the create didn't
+ happen because of crashing.
+ */
+ virtual void createdFile(string filename, unsigned long long len) = 0;
+
+ /** Declarations of write intent.
+
+ Use these methods to declare "i'm about to write to x and it should be logged for redo."
+
+ Failure to call writing...() is checked in _DEBUG mode by using a read-only mapped view
+ (i.e., you'll segfault if that code path is exercised). The _DEBUG check doesn't
+ verify that your length is correct though.
+ */
+
+ /** declare intent to write to x for up to len
+ @return pointer where to write. this is modified when testIntent is true.
+ */
+ virtual void* writingPtr(void *x, unsigned len) = 0;
+
+ /** declare write intent; should already be in the write view to work correctly when testIntent is true.
+ if you aren't, use writingPtr() instead.
+ */
+ virtual void declareWriteIntent(void *x, unsigned len) = 0;
+
+ /** declare intent to write
+ @param ofs offset within buf at which we will write
+ @param len the length at ofs we will write
+ @return new buffer pointer. this is modified when testIntent is true.
+ */
+ virtual void* writingAtOffset(void *buf, unsigned ofs, unsigned len) = 0;
+
+ /** declare intent to write
+ @param ranges vector of pairs representing ranges. Each pair
+ comprises an offset from buf where a range begins, then the
+ range length.
+ @return new buffer pointer. this is modified when testIntent is true.
+ */
+ virtual void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) = 0;
+
+ /** Wait for acknowledgement of the next group commit.
+ @return true if --dur is on. There will be delay.
+ @return false if --dur is off.
+ */
+ virtual bool awaitCommit() = 0;
+
+ /** Commit immediately.
+
+ Generally, you do not want to do this often, as highly granular committing may affect
+ performance.
+
+ Does not return until the commit is complete.
+
+ You must be at least read locked when you call this. Ideally, you are not write locked
+ and then read operations can occur concurrently.
+
+ @return true if --dur is on.
+ @return false if --dur is off. (in which case there is no action)
+ */
+ virtual bool commitNow() = 0;
+
+ /** Commit if enough bytes have been modified. Current threshold is UncommittedBytesLimit
+ (50MB on 32 bit builds, 100MB otherwise).
+
+ The idea is that long running write operations that don't yield
+ (like creating an index or an update with $atomic) can call this
+ whenever the db is in a sane state and it will prevent commits
+ from growing too large.
+ @return true if committed
+ */
+ virtual bool commitIfNeeded() = 0;
+
+ /** @return true if time to commit but does NOT do a commit */
+ virtual bool aCommitIsNeeded() const = 0;
+
+ /** Declare write intent for a DiskLoc. @see DiskLoc::writing() */
+ inline DiskLoc& writingDiskLoc(DiskLoc& d) { return *((DiskLoc*) writingPtr(&d, sizeof(d))); }
+
+ /** Declare write intent for an int */
+ inline int& writingInt(const int& d) { return *((int*) writingPtr((int*) &d, sizeof(d))); }
+
+ /** "assume i've already indicated write intent, let me write"
+ redeclaration is fine too, but this is faster.
+ */
+ template <typename T>
+ inline
+ T* alreadyDeclared(T *x) {
+#if defined(_TESTINTENT)
+ return (T*) MongoMMF::switchToPrivateView(x);
+#else
+ return x;
+#endif
+ }
+
+ /** declare intent to write to x for sizeof(*x) */
+ template <typename T>
+ inline
+ T* writing(T *x) {
+ return (T*) writingPtr(x, sizeof(T));
+ }
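+
+ /* typical call pattern (illustrative; 'header', 'MyStruct', 'p', and 'v' are
+ hypothetical names, not part of this interface):
+
+ getDur().writingInt(header.len) = newLen; // declare intent, assign via returned ref
+
+ MyStruct *s = getDur().writing(p); // p points into a mapped view
+ s->field = v; // mutate via the returned pointer
+
+ the mutation must go through the returned pointer/reference so that the
+ testIntent view switch (when enabled) is honored. */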
+
+ /** write something that doesn't have to be journaled, as this write is "unimportant".
+ a good example is paddingFactor.
+ can be thought of as memcpy(dst,src,len)
+ the dur implementation acquires a mutex in this method, so do not assume it is faster
+ without measuring!
+ */
+ virtual void setNoJournal(void *dst, void *src, unsigned len) = 0;
+
+ /** Commits pending changes, flushes all changes to main data
+ files, then removes the journal.
+
+ This is useful as a "barrier" to ensure that writes before this
+ call will never go through recovery and be applied to files
+ that have had changes made after this call applied.
+ */
+ virtual void syncDataAndTruncateJournal() = 0;
+
+ static DurableInterface& getDur() { return *_impl; }
+
+ private:
+ /** Intentionally unimplemented method.
+ It's very easy to manipulate Record::data open ended. Thus a call to writing(Record*) is suspect.
+ This will override the templated version and yield an unresolved external.
+ */
+ Record* writing(Record* r);
+ /** Intentionally unimplemented method. BtreeBuckets are allocated in buffers larger than sizeof( BtreeBucket ). */
+// BtreeBucket* writing( BtreeBucket* );
+ /** Intentionally unimplemented method. NamespaceDetails may be based on references to 'Extra' objects. */
+ NamespaceDetails* writing( NamespaceDetails* );
+
+ static DurableInterface* _impl; // NonDurableImpl at startup()
+ static void enableDurability(); // makes _impl a DurableImpl
+ static void disableDurability(); // makes _impl a NonDurableImpl
+
+ // these need to be able to enable/disable Durability
+ friend void startup();
+ friend class TempDisableDurability;
+ }; // class DurableInterface
+
+ class NonDurableImpl : public DurableInterface {
+ void* writingPtr(void *x, unsigned len) { return x; }
+ void* writingAtOffset(void *buf, unsigned ofs, unsigned len) { return buf; }
+ void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges) { return buf; }
+ void declareWriteIntent(void *, unsigned) { }
+ void createdFile(string filename, unsigned long long len) { }
+ bool awaitCommit() { return false; }
+ bool commitNow() { return false; }
+ bool commitIfNeeded() { return false; }
+ bool aCommitIsNeeded() const { return false; }
+ void setNoJournal(void *dst, void *src, unsigned len);
+ void syncDataAndTruncateJournal() {}
+ };
+
+ class DurableImpl : public DurableInterface {
+ void* writingPtr(void *x, unsigned len);
+ void* writingAtOffset(void *buf, unsigned ofs, unsigned len);
+ void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges);
+ void declareWriteIntent(void *, unsigned);
+ void createdFile(string filename, unsigned long long len);
+ bool awaitCommit();
+ bool commitNow();
+ bool aCommitIsNeeded() const;
+ bool commitIfNeeded();
+ void setNoJournal(void *dst, void *src, unsigned len);
+ void syncDataAndTruncateJournal();
+ };
+
+ } // namespace dur
+
+ inline dur::DurableInterface& getDur() { return dur::DurableInterface::getDur(); }
+
+ /** declare that we are modifying a diskloc and this is a datafile write. */
+ inline DiskLoc& DiskLoc::writing() const { return getDur().writingDiskLoc(*const_cast< DiskLoc * >( this )); }
+
+}
diff --git a/src/mongo/db/dur_commitjob.cpp b/src/mongo/db/dur_commitjob.cpp
new file mode 100644
index 00000000000..5a9e9cb5679
--- /dev/null
+++ b/src/mongo/db/dur_commitjob.cpp
@@ -0,0 +1,240 @@
+/* @file dur_commitjob.cpp */
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dur_commitjob.h"
+#include "dur_stats.h"
+#include "taskqueue.h"
+#include "client.h"
+
+namespace mongo {
+
+ namespace dur {
+
+ BOOST_STATIC_ASSERT( UncommittedBytesLimit > BSONObjMaxInternalSize * 3 );
+ BOOST_STATIC_ASSERT( sizeof(void*)==4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6 );
+
+ void Writes::D::go(const Writes::D& d) {
+ commitJob.wi()._insertWriteIntent(d.p, d.len);
+ }
+
+ void WriteIntent::absorb(const WriteIntent& other) {
+ dassert(overlaps(other));
+
+ void* newStart = min(start(), other.start());
+ p = max(p, other.p);
+ len = (char*)p - (char*)newStart;
+
+ dassert(contains(other));
+ }
+
+ void Writes::clear() {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ _alreadyNoted.clear();
+ _writes.clear();
+ _ops.clear();
+ _drained = false;
+#if defined(DEBUG_WRITE_INTENT)
+ cout << "_debug clear\n";
+ _debug.clear();
+#endif
+ }
+
+#if defined(DEBUG_WRITE_INTENT)
+ void assertAlreadyDeclared(void *p, int len) {
+ if( commitJob.wi()._debug[p] >= len )
+ return;
+ log() << "assertAlreadyDeclared fails " << (void*)p << " len:" << len << ' ' << commitJob.wi()._debug[p] << endl;
+ printStackTrace();
+ abort();
+ }
+#endif
+
+ void Writes::_insertWriteIntent(void* p, int len) {
+ WriteIntent wi(p, len);
+
+ if (_writes.empty()) {
+ _writes.insert(wi);
+ return;
+ }
+
+ typedef set<WriteIntent>::const_iterator iterator; // shorter
+
+ iterator closest = _writes.lower_bound(wi);
+ // closest.end() >= wi.end()
+
+ if ((closest != _writes.end() && closest->overlaps(wi)) || // high end
+ (closest != _writes.begin() && (--closest)->overlaps(wi))) { // low end
+ if (closest->contains(wi))
+ return; // nothing to do
+
+ // find overlapping range and merge into wi
+ iterator end(closest);
+ iterator begin(closest);
+ while ( end->overlaps(wi)) { wi.absorb(*end); ++end; if (end == _writes.end()) break; } // look forwards
+ while (begin->overlaps(wi)) { wi.absorb(*begin); if (begin == _writes.begin()) break; --begin; } // look backwards
+ if (!begin->overlaps(wi)) ++begin; // make inclusive
+
+ DEV { // ensure we're not deleting anything we shouldn't
+ for (iterator it(begin); it != end; ++it) {
+ assert(wi.contains(*it));
+ }
+ }
+
+ _writes.erase(begin, end);
+ _writes.insert(wi);
+
+ DEV { // ensure there are no overlaps
+ // this can be very slow - n^2 - so make it RARELY
+ RARELY {
+ for (iterator it(_writes.begin()), end(boost::prior(_writes.end())); it != end; ++it) {
+ assert(!it->overlaps(*boost::next(it)));
+ }
+ }
+ }
+ }
+ else { // no entries overlapping wi
+ _writes.insert(closest, wi);
+ }
+ }
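+
+ /* worked example of the merge above (hypothetical addresses): with existing
+ intents covering [0x1000,0x1010) and [0x1020,0x1030), inserting an intent
+ for [0x1008,0x1024) overlaps both; the forward/backward loops absorb() both
+ neighbors into wi, the erase(begin, end) removes the two originals, and a
+ single merged intent covering [0x1000,0x1030) is inserted in their place. */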
+
+ /** note an operation other than a "basic write" */
+ void CommitJob::noteOp(shared_ptr<DurOp> p) {
+ d.dbMutex.assertWriteLocked();
+ dassert( cmdLine.dur );
+ cc()._hasWrittenThisPass = true;
+ if( !_hasWritten ) {
+ assert( !d.dbMutex._remapPrivateViewRequested );
+ _hasWritten = true;
+ }
+ _wi._ops.push_back(p);
+ }
+
+ size_t privateMapBytes = 0; // used by _REMAPPRIVATEVIEW to track how much / how fast to remap
+
+ void CommitJob::beginCommit() {
+ DEV d.dbMutex.assertAtLeastReadLocked();
+ _commitNumber = _notify.now();
+ stats.curr->_commits++;
+ }
+
+ void CommitJob::reset() {
+ _hasWritten = false;
+ _wi.clear();
+ privateMapBytes += _bytes;
+ _bytes = 0;
+ _nSinceCommitIfNeededCall = 0;
+ }
+
+ CommitJob::CommitJob() : _ab(4 * 1024 * 1024) , _hasWritten(false),
+ _bytes(0), _nSinceCommitIfNeededCall(0) {
+ _commitNumber = 0;
+ }
+
+ extern unsigned notesThisLock;
+
+ void CommitJob::note(void* p, int len) {
+ // from the point of view of the dur module, it would be fine (i think) to only
+ // be read locked here, but we must be at least read locked to avoid a race with
+ // remapprivateview
+ DEV notesThisLock++;
+ DEV d.dbMutex.assertWriteLocked();
+ dassert( cmdLine.dur );
+ cc()._hasWrittenThisPass = true;
+ if( !_wi._alreadyNoted.checkAndSet(p, len) ) {
+ MemoryMappedFile::makeWritable(p, len);
+
+ if( !_hasWritten ) {
+ // you can't be writing if one of these is pending, so this is a verification.
+ assert( !d.dbMutex._remapPrivateViewRequested ); // safe to assert here since it must be the first write in a write lock
+
+ // we don't bother doing a group commit when nothing is written, so we have a var to track that
+ _hasWritten = true;
+ }
+
+ /** tips for debugging:
+ if you have an incorrect diff between data files in different folders
+ (see jstests/dur/quick.js for example),
+ turn this on and see what is logged. if you have a copy of its output from before the
+ regression, a simple diff of these lines would likely tell you a lot.
+ */
+#if 0 && defined(_DEBUG)
+ {
+ static int n;
+ if( ++n < 10000 ) {
+ size_t ofs;
+ MongoMMF *mmf = privateViews._find(w.p, ofs);
+ if( mmf ) {
+ log() << "DEBUG note write intent " << w.p << ' ' << mmf->filename() << " ofs:" << hex << ofs << " len:" << w.len << endl;
+ }
+ else {
+ log() << "DEBUG note write intent " << w.p << ' ' << w.len << " NOT FOUND IN privateViews" << endl;
+ }
+ }
+ else if( n == 10000 ) {
+ log() << "DEBUG stopping write intent logging, too much to log" << endl;
+ }
+ }
+#endif
+
+ // remember intent. we will journal it in a bit
+ _wi.insertWriteIntent(p, len);
+ wassert( _wi._writes.size() < 2000000 );
+ //assert( _wi._writes.size() < 20000000 );
+
+ {
+ // a bit over-conservative in counting page bytes used
+ static size_t lastPos; // note this doesn't reset with each commit, but that is ok; we aren't being that precise
+ size_t x = ((size_t) p) & ~0xfff; // round off to page address (4KB)
+ if( x != lastPos ) {
+ lastPos = x;
+ unsigned b = (len+4095) & ~0xfff;
+ _bytes += b;
+#if defined(_DEBUG)
+ _nSinceCommitIfNeededCall++;
+ if( _nSinceCommitIfNeededCall >= 80 ) {
+ if( _nSinceCommitIfNeededCall % 40 == 0 ) {
+ log() << "debug nsincecommitifneeded:" << _nSinceCommitIfNeededCall << " bytes:" << _bytes << endl;
+ if( _nSinceCommitIfNeededCall == 120 || _nSinceCommitIfNeededCall == 1200 ) {
+ log() << "_DEBUG printing stack given high nsinccommitifneeded number" << endl;
+ printStackTrace();
+ }
+ }
+ }
+#endif
+ if (_bytes > UncommittedBytesLimit * 3) {
+ static time_t lastComplain;
+ static unsigned nComplains;
+ // throttle logging
+ if( ++nComplains < 100 || time(0) - lastComplain >= 60 ) {
+ lastComplain = time(0);
+ warning() << "DR102 too much data written uncommitted " << _bytes/1000000.0 << "MB" << endl;
+ if( nComplains < 10 || nComplains % 10 == 0 ) {
+ // wassert makes getLastError show an error, so we just print stack trace
+ printStackTrace();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
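+
+ /* page accounting sketch (illustrative numbers): for p = (void*)0x12345 and
+ len = 100, the block above computes x = 0x12000 (the 4KB page containing p)
+ and, when the page differs from lastPos, charges b = (100 + 4095) & ~0xfff
+ = 4096 bytes to _bytes -- intents are counted in whole pages, deliberately
+ over-counting so early commits trigger sooner rather than later. */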
+
+ }
+}
diff --git a/src/mongo/db/dur_commitjob.h b/src/mongo/db/dur_commitjob.h
new file mode 100644
index 00000000000..bfc5e3c268f
--- /dev/null
+++ b/src/mongo/db/dur_commitjob.h
@@ -0,0 +1,220 @@
+/* @file dur_commitjob.h used by dur.cpp
+*/
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/alignedbuilder.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/concurrency/synchronization.h"
+#include "cmdline.h"
+#include "durop.h"
+#include "dur.h"
+#include "taskqueue.h"
+
+//#define DEBUG_WRITE_INTENT 1
+
+namespace mongo {
+ namespace dur {
+
+ /** declaration of an intent to write to a region of a memory mapped view
+ *
+ * We store the end rather than the start pointer to make operator< faster
+ * since that is heavily used in set lookup.
+ */
+ struct WriteIntent { /* copyable */
+ WriteIntent() : /*w_ptr(0), */ p(0) { }
+ WriteIntent(void *a, unsigned b) : /*w_ptr(0), */ p((char*)a+b), len(b) { }
+
+ void* start() const { return (char*)p - len; }
+ void* end() const { return p; }
+ unsigned length() const { return len; }
+
+ bool operator < (const WriteIntent& rhs) const { return end() < rhs.end(); }
+
+ // can they be merged?
+ bool overlaps(const WriteIntent& rhs) const {
+ return (start() <= rhs.end() && end() >= rhs.start());
+ }
+
+ // is rhs already fully covered? (if so, no merge is needed)
+ bool contains(const WriteIntent& rhs) const {
+ return (start() <= rhs.start() && end() >= rhs.end());
+ }
+
+ // merge into me
+ void absorb(const WriteIntent& other);
+
+ friend ostream& operator << (ostream& out, const WriteIntent& wi) {
+ return (out << "p: " << wi.p << " end: " << wi.end() << " len: " << wi.len);
+ }
+
+ //mutable void *w_ptr; // writable mapping of p.
+ // mutable because set::iterator is const but this isn't used in op<
+#if defined(_EXPERIMENTAL)
+ mutable unsigned ofsInJournalBuffer;
+#endif
+ private:
+ void *p; // intent to write up to p
+ unsigned len; // up to this len
+ };
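+
+ /* end-pointer semantics in miniature (illustrative):
+
+ char buf[0x40];
+ WriteIntent a(buf, 0x10); // covers [buf, buf+0x10)
+ WriteIntent b(buf + 0x20, 0x10); // covers [buf+0x20, buf+0x30)
+ WriteIntent w(buf + 0x08, 0x1c); // covers [buf+0x08, buf+0x24)
+
+ here w.overlaps(a) and w.overlaps(b) are both true, and absorbing both
+ yields one intent covering [buf, buf+0x30); operator< compares end()
+ pointers, which is what makes the set<WriteIntent> lower_bound probe
+ in _insertWriteIntent() cheap. */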
+
+ /** try to remember things we have already marked for journaling. false negatives are ok if infrequent -
+ we will just log them twice.
+ */
+ template<int Prime>
+ class Already : boost::noncopyable {
+ public:
+ Already() { clear(); }
+ void clear() { memset(this, 0, sizeof(*this)); }
+
+ /* see if we have Already recorded/indicated our write intent for this region of memory.
+ automatically upgrades the length if the length was shorter previously.
+ @return true if already indicated.
+ */
+ bool checkAndSet(void* p, int len) {
+ unsigned x = mongoutils::hashPointer(p);
+ pair<void*, int>& nd = nodes[x % N];
+ if( nd.first == p ) {
+ if( nd.second < len ) {
+ nd.second = len;
+ return false; // haven't indicated this len yet
+ }
+ return true; // already indicated
+ }
+ nd.first = p;
+ nd.second = len;
+ return false; // a new set
+ }
+
+ private:
+ enum { N = Prime }; // this should be small; the idea is that it fits in the cpu cache easily
+ pair<void*,int> nodes[N];
+ };
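+
+ /* behavior sketch (illustrative):
+
+ Already<127> a;
+ char buf[8];
+ a.checkAndSet(buf, 4); // false: first note of this region
+ a.checkAndSet(buf, 4); // true: already indicated at this length
+ a.checkAndSet(buf, 8); // false: longer than before, note it again
+
+ a hash collision simply overwrites a slot, yielding a false negative;
+ per the class comment that only costs a duplicate journal entry. */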
+
+ /** our record of pending/uncommitted write intents */
+ class Writes : boost::noncopyable {
+ struct D {
+ void *p;
+ unsigned len;
+ static void go(const D& d);
+ };
+ public:
+ TaskQueue<D> _deferred;
+ Already<127> _alreadyNoted;
+ set<WriteIntent> _writes;
+ vector< shared_ptr<DurOp> > _ops; // all the ops other than basic writes
+ bool _drained; // _deferred is drained? for asserting/testing
+
+ /** reset the Writes structure (empties all the above) */
+ void clear();
+
+ /** merges into set (ie non-deferred version) */
+ void _insertWriteIntent(void* p, int len);
+
+ void insertWriteIntent(void* p, int len) {
+#if defined(DEBUG_WRITE_INTENT)
+ if( _debug[p] < len )
+ _debug[p] = len;
+#endif
+ D d;
+ d.p = p;
+ d.len = len;
+ _deferred.defer(d);
+ }
+
+#ifdef _DEBUG
+ WriteIntent _last;
+#endif
+#if defined(DEBUG_WRITE_INTENT)
+ map<void*,int> _debug;
+#endif
+ };
+
+#if defined(DEBUG_WRITE_INTENT)
+ void assertAlreadyDeclared(void *, int len);
+#else
+ inline void assertAlreadyDeclared(void *, int len) { }
+#endif
+
+ /** A commit job object for a group commit. Currently there is one instance of this object.
+
+ concurrency: assumption is caller is appropriately locking.
+ for example note() invocations are from the write lock.
+ other uses are in a read lock from a single thread (durThread)
+ */
+ class CommitJob : boost::noncopyable {
+ public:
+ AlignedBuilder _ab; // for direct i/o writes to journal
+
+ CommitJob();
+
+ ~CommitJob(){ assert(!"shouldn't destroy CommitJob!"); }
+
+ /** record/note an intent to write */
+ void note(void* p, int len);
+
+ /** note an operation other than a "basic write" */
+ void noteOp(shared_ptr<DurOp> p);
+
+ set<WriteIntent>& writes() {
+ if( !_wi._drained ) {
+ // generally, you don't want to use the set until it is prepared (after deferred ops are applied)
+ // thus this assert here.
+ assert(false);
+ }
+ return _wi._writes;
+ }
+
+ vector< shared_ptr<DurOp> >& ops() { return _wi._ops; }
+
+ /** this method is safe to call outside of locks. when hasWritten() is false we don't do any group commit and avoid even
+ trying to acquire a lock, which might be helpful at times.
+ */
+ bool hasWritten() const { return _hasWritten; }
+
+ /** we use the commitjob object over and over, calling reset() rather than reconstructing */
+ void reset();
+
+ void beginCommit();
+
+ /** the commit code calls this when data reaches the journal (on disk) */
+ void notifyCommitted() { _notify.notifyAll(_commitNumber); }
+
+ /** we check how much has been written, and if it is getting to be a lot, we commit sooner. */
+ size_t bytes() const { return _bytes; }
+
+#if defined(_DEBUG)
+ const WriteIntent& lastWrite() const { return _wi._last; }
+#endif
+
+ Writes& wi() { return _wi; }
+ private:
+ NotifyAll::When _commitNumber;
+ bool _hasWritten;
+ Writes _wi; // todo: fix name
+ size_t _bytes;
+ public:
+ NotifyAll _notify; // for getlasterror fsync:true acknowledgements
+ unsigned _nSinceCommitIfNeededCall;
+ };
+
+ extern CommitJob& commitJob;
+
+ }
+}
diff --git a/src/mongo/db/dur_journal.cpp b/src/mongo/db/dur_journal.cpp
new file mode 100644
index 00000000000..6a6609f55ee
--- /dev/null
+++ b/src/mongo/db/dur_journal.cpp
@@ -0,0 +1,748 @@
+// @file dur_journal.cpp writing to the writeahead logging journal
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "client.h"
+#include "namespace.h"
+#include "dur_journal.h"
+#include "dur_journalformat.h"
+#include "dur_stats.h"
+#include "../util/logfile.h"
+#include "../util/timer.h"
+#include "../util/alignedbuilder.h"
+#include "../util/net/listen.h" // getelapsedtimemillis
+#include <boost/static_assert.hpp>
+#include <boost/filesystem.hpp>
+#undef assert
+#define assert MONGO_assert
+#include "../util/mongoutils/str.h"
+#include "dur_journalimpl.h"
+#include "../util/file.h"
+#include "../util/checksum.h"
+#include "../util/concurrency/race.h"
+#include "../util/compress.h"
+#include "../server.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ class AlignedBuilder;
+
+ unsigned goodRandomNumberSlow();
+
+ namespace dur {
+ // Rotate after reaching this data size in a journal (j._<n>) file
+ // We use a smaller size for 32 bit as the journal is mmapped during recovery (only)
+ // Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must
+ // work. (and should as-is)
+ // --smallfiles makes the limit small.
+
+#if defined(_DEBUG)
+ unsigned long long DataLimitPerJournalFile = 128 * 1024 * 1024;
+#elif defined(__APPLE__)
+ // assuming a developer box if OS X
+ unsigned long long DataLimitPerJournalFile = 256 * 1024 * 1024;
+#else
+ unsigned long long DataLimitPerJournalFile = (sizeof(void*)==4) ? 256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024;
+#endif
+
+ BOOST_STATIC_ASSERT( sizeof(Checksum) == 16 );
+ BOOST_STATIC_ASSERT( sizeof(JHeader) == 8192 );
+ BOOST_STATIC_ASSERT( sizeof(JSectHeader) == 20 );
+ BOOST_STATIC_ASSERT( sizeof(JSectFooter) == 32 );
+ BOOST_STATIC_ASSERT( sizeof(JEntry) == 12 );
+ BOOST_STATIC_ASSERT( sizeof(LSNFile) == 88 );
+
+ bool usingPreallocate = false;
+
+ void removeOldJournalFile(path p);
+
+ boost::filesystem::path getJournalDir() {
+ boost::filesystem::path p(dbpath);
+ p /= "journal";
+ return p;
+ }
+
+ path lsnPath() {
+ return getJournalDir()/"lsn";
+ }
+
+ /** this should be called when something really bad happens so that we can flag appropriately
+ */
+ void journalingFailure(const char *msg) {
+ /** todo:
+ (1) don't log too much
+ (2) make an indicator in the journal dir that something bad happened.
+ (2b) refuse to do a recovery startup if that is there without manual override.
+ */
+ log() << "journaling failure/error: " << msg << endl;
+ assert(false);
+ }
+
+ JSectFooter::JSectFooter() {
+ memset(this, 0, sizeof(*this));
+ sentinel = JEntry::OpCode_Footer;
+ }
+
+ JSectFooter::JSectFooter(const void* begin, int len) { // needs buffer to compute hash
+ sentinel = JEntry::OpCode_Footer;
+ reserved = 0;
+ magic[0] = magic[1] = magic[2] = magic[3] = '\n';
+
+ Checksum c;
+ c.gen(begin, (unsigned) len);
+ memcpy(hash, c.bytes, sizeof(hash));
+ }
+
+ bool JSectFooter::checkHash(const void* begin, int len) const {
+ if( !magicOk() ) {
+ log() << "journal footer not valid" << endl;
+ return false;
+ }
+ Checksum c;
+ c.gen(begin, len);
+ DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(c.bytes, 16) << endl;
+ if( memcmp(hash, c.bytes, sizeof(hash)) == 0 )
+ return true;
+ log() << "journal checkHash mismatch, got: " << toHex(c.bytes, 16) << " expected: " << toHex(hash,16) << endl;
+ return false;
+ }
+
+ JHeader::JHeader(string fname) {
+ magic[0] = 'j'; magic[1] = '\n';
+ _version = CurrentVersion;
+ memset(ts, 0, sizeof(ts));
+ time_t t = time(0);
+ strncpy(ts, time_t_to_String_short(t).c_str(), sizeof(ts)-1);
+ memset(dbpath, 0, sizeof(dbpath));
+ strncpy(dbpath, fname.c_str(), sizeof(dbpath)-1);
+ {
+ fileId = t&0xffffffff;
+ fileId |= ((unsigned long long)goodRandomNumberSlow()) << 32;
+ }
+ memset(reserved3, 0, sizeof(reserved3));
+ txt2[0] = txt2[1] = '\n';
+ n1 = n2 = n3 = n4 = '\n';
+ }
+
+ Journal j;
+
+ const unsigned long long LsnShutdownSentinel = ~((unsigned long long)0);
+
+ Journal::Journal() :
+ _curLogFileMutex("JournalLfMutex") {
+ _ageOut = true;
+ _written = 0;
+ _nextFileNumber = 0;
+ _curLogFile = 0;
+ _curFileId = 0;
+ _preFlushTime = 0;
+ _lastFlushTime = 0;
+ _writeToLSNNeeded = false;
+ }
+
+ path Journal::getFilePathFor(int filenumber) const {
+ boost::filesystem::path p(dir);
+ p /= string(str::stream() << "j._" << filenumber);
+ return p;
+ }
+
+ /** never throws
+ @return true if journal dir is not empty
+ */
+ bool haveJournalFiles() {
+ try {
+ for ( boost::filesystem::directory_iterator i( getJournalDir() );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") )
+ return true;
+ }
+ }
+ catch(...) { }
+ return false;
+ }
+
+ /** throws */
+ void removeJournalFiles() {
+ log() << "removeJournalFiles" << endl;
+ try {
+ for ( boost::filesystem::directory_iterator i( getJournalDir() );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") ) {
+ try {
+ removeOldJournalFile(*i);
+ }
+ catch(std::exception& e) {
+ log() << "couldn't remove " << fileName << ' ' << e.what() << endl;
+ throw;
+ }
+ }
+ }
+ try {
+ boost::filesystem::remove(lsnPath());
+ }
+ catch(...) {
+ log() << "couldn't remove " << lsnPath().string() << endl;
+ throw;
+ }
+ }
+ catch( std::exception& e ) {
+ log() << "error removing journal files " << e.what() << endl;
+ throw;
+ }
+ assert(!haveJournalFiles());
+
+ flushMyDirectory(getJournalDir() / "file"); // flushes parent of argument (in this case journal dir)
+
+ log(1) << "removeJournalFiles end" << endl;
+ }
+
+ /** at clean shutdown */
+ bool okToCleanUp = false; // successful recovery would set this to true
+ void Journal::cleanup(bool _log) {
+ if( !okToCleanUp )
+ return;
+
+ if( _log )
+ log() << "journalCleanup..." << endl;
+ try {
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+ closeCurrentJournalFile();
+ removeJournalFiles();
+ }
+ catch(std::exception& e) {
+ log() << "error couldn't remove journal file during shutdown " << e.what() << endl;
+ throw;
+ }
+ }
+ void journalCleanup(bool log) { j.cleanup(log); }
+
+ bool _preallocateIsFaster() {
+ bool faster = false;
+ boost::filesystem::path p = getJournalDir() / "tempLatencyTest";
+ try { remove(p); } catch(...) { }
+ try {
+ AlignedBuilder b(8192);
+ int millis[2];
+ const int N = 50;
+ for( int pass = 0; pass < 2; pass++ ) {
+ LogFile f(p.string());
+ Timer t;
+ for( int i = 0 ; i < N; i++ ) {
+ f.synchronousAppend(b.buf(), 8192);
+ }
+ millis[pass] = t.millis();
+ // second time through, file exists and is prealloc case
+ }
+ int diff = millis[0] - millis[1];
+ if( diff > 2 * N ) {
+ // at least 2ms faster for prealloc case?
+ faster = true;
+ log() << "preallocateIsFaster=true " << diff / (1.0*N) << endl;
+ }
+ }
+ catch(...) {
+ log() << "info preallocateIsFaster couldn't run; returning false" << endl;
+ }
+ try { remove(p); } catch(...) { }
+ return faster;
+ }
+ bool preallocateIsFaster() {
+ Timer t;
+ bool res = false;
+ if( _preallocateIsFaster() && _preallocateIsFaster() ) {
+ // maybe the system is just super busy at the moment? sleep a second to let it calm down.
+ // deciding to prealloc is a medium-big decision:
+ sleepsecs(1);
+ res = _preallocateIsFaster();
+ }
+ if( t.millis() > 3000 )
+ log() << "preallocateIsFaster check took " << t.millis()/1000.0 << " secs" << endl;
+ return res;
+ }
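+
+ /* decision arithmetic (illustrative): each pass does N = 50 synchronous 8KB
+ appends, and prealloc is judged faster only when diff > 2*N, i.e. the
+ preallocated pass beats the growing-file pass by more than 100ms total,
+ or >2ms per append on average -- and the test must win twice, plus once
+ more after a one second pause, before prealloc is actually enabled. */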
+
+ // throws
+ void preallocateFile(boost::filesystem::path p, unsigned long long len) {
+ if( exists(p) )
+ return;
+
+ log() << "preallocating a journal file " << p.string() << endl;
+
+ const unsigned BLKSZ = 1024 * 1024;
+ assert( len % BLKSZ == 0 );
+
+ AlignedBuilder b(BLKSZ);
+ memset((void*)b.buf(), 0, BLKSZ);
+
+ ProgressMeter m(len, 3/*secs*/, 10/*hits between time check (once every 6.4MB)*/);
+
+ File f;
+ f.open( p.string().c_str() , /*read-only*/false , /*direct-io*/false );
+ assert( f.is_open() );
+ fileofs loc = 0;
+ while ( loc < len ) {
+ f.write( loc , b.buf() , BLKSZ );
+ loc += BLKSZ;
+ m.hit(BLKSZ);
+ }
+ assert( loc == len );
+ f.fsync();
+ }
+
+ const int NUM_PREALLOC_FILES = 3;
+ inline boost::filesystem::path preallocPath(int n) {
+ assert(n >= 0);
+ assert(n < NUM_PREALLOC_FILES);
+ string fn = str::stream() << "prealloc." << n;
+ return getJournalDir() / fn;
+ }
+
+ // throws
+ void _preallocateFiles() {
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+
+ unsigned long long limit = DataLimitPerJournalFile;
+ if( debug && i == 1 ) {
+ // moving 32->64, the prealloc files would be short. that is "ok", but we want to exercise that
+ // case, so when _DEBUG is set we force it here by arbitrarily stopping prealloc at a low
+ // limit for one file. we also want to be able to change the constant in the future without a
+ // lot of work anyway.
+ limit = 16 * 1024 * 1024;
+ }
+ preallocateFile(filepath, limit);
+ }
+ }
+
+ void checkFreeSpace() {
+ unsigned long long spaceNeeded = static_cast<unsigned long long>(3 * DataLimitPerJournalFile * 1.1); // add 10% for headroom
+ unsigned long long freeSpace = File::freeSpace(getJournalDir().string());
+ unsigned long long prealloced = 0;
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if (exists(filepath))
+ prealloced += file_size(filepath);
+ }
+
+ if (freeSpace + prealloced < spaceNeeded) {
+ log() << endl;
+ error() << "Insufficient free space for journals." << endl;
+ log() << "Please make at least " << spaceNeeded/(1024*1024) << "MB available in " << getJournalDir().string() << endl;
+ log() << endl;
+ throw UserException(15926, "Insufficient free space for journals");
+ }
+ }
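+
+ /* e.g. on a 64-bit non-debug build DataLimitPerJournalFile is 1GB, so
+ spaceNeeded is 3 * 1GB * 1.1, roughly 3.3GB; any existing prealloc.<n>
+ files count toward that total, since their space is already reserved. */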
+
+ void preallocateFiles() {
+ if (! (cmdLine.durOptions & CmdLine::DurNoCheckSpace))
+ checkFreeSpace();
+
+ if( exists(preallocPath(0)) || // if enabled previously, keep using
+ exists(preallocPath(1)) ||
+ ( cmdLine.preallocj && preallocateIsFaster() ) ) {
+ usingPreallocate = true;
+ try {
+ _preallocateFiles();
+ }
+ catch(...) {
+ log() << "warning caught exception in preallocateFiles, continuing" << endl;
+ }
+ }
+ j.open();
+ }
+
+ void removeOldJournalFile(path p) {
+ if( usingPreallocate ) {
+ try {
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if( !boost::filesystem::exists(filepath) ) {
+ // we can recycle this file into this prealloc file location
+ boost::filesystem::path temppath = filepath.string() + ".temp";
+ boost::filesystem::rename(p, temppath);
+ {
+ // zero the header
+ File f;
+ f.open(temppath.string().c_str(), false, false);
+ char buf[8192];
+ memset(buf, 0, 8192);
+ f.write(0, buf, 8192);
+ f.truncate(DataLimitPerJournalFile);
+ f.fsync();
+ }
+ boost::filesystem::rename(temppath, filepath);
+ return;
+ }
+ }
+ } catch(...) {
+ log() << "warning exception in dur::removeOldJournalFile " << p.string() << endl;
+ // fall through and try to delete the file
+ }
+ }
+
+ // already have 3 prealloc files, so delete this file
+ try {
+ boost::filesystem::remove(p);
+ }
+ catch(...) {
+ log() << "warning exception removing " << p.string() << endl;
+ }
+ }
+
+ // find a prealloc.<n> file, presumably to take and use
+ path findPrealloced() {
+ try {
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if( boost::filesystem::exists(filepath) )
+ return filepath;
+ }
+ } catch(...) {
+ log() << "warning exception in dur::findPrealloced()" << endl;
+ }
+ return path();
+ }
+
+ /** assure journal/ dir exists. throws. call during startup. */
+ void journalMakeDir() {
+ j.init();
+
+ boost::filesystem::path p = getJournalDir();
+ j.dir = p.string();
+ log() << "journal dir=" << j.dir << endl;
+ if( !exists(j.dir) ) {
+ try {
+ create_directory(j.dir);
+ }
+ catch(std::exception& e) {
+ log() << "error creating directory " << j.dir << ' ' << e.what() << endl;
+ throw;
+ }
+ }
+ }
+
+ void Journal::_open() {
+ _curFileId = 0;
+ assert( _curLogFile == 0 );
+ path fname = getFilePathFor(_nextFileNumber);
+
+ // if we have a prealloced file, use it
+ {
+ path p = findPrealloced();
+ if( !p.empty() ) {
+ try {
+ {
+ // JHeader::fileId must be updated before renaming to be race-safe
+ LogFile f(p.string());
+ JHeader h(p.string());
+ AlignedBuilder b(8192);
+ b.appendStruct(h);
+ f.synchronousAppend(b.buf(), b.len());
+ }
+ boost::filesystem::rename(p, fname);
+ }
+ catch(...) {
+ log() << "warning couldn't write to / rename file " << p.string() << endl;
+ }
+ }
+ }
+
+ _curLogFile = new LogFile(fname.string());
+ _nextFileNumber++;
+ {
+ JHeader h(fname.string());
+ _curFileId = h.fileId;
+ assert(_curFileId);
+ AlignedBuilder b(8192);
+ b.appendStruct(h);
+ _curLogFile->synchronousAppend(b.buf(), b.len());
+ }
+ }
+
+ void Journal::init() {
+ assert( _curLogFile == 0 );
+ MongoFile::notifyPreFlush = preFlush;
+ MongoFile::notifyPostFlush = postFlush;
+ }
+
+ void Journal::open() {
+ assert( MongoFile::notifyPreFlush == preFlush );
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+ _open();
+ }
+
+ void LSNFile::set(unsigned long long x) {
+ memset(this, 0, sizeof(*this));
+ lsn = x;
+ checkbytes = ~x;
+ }
+
+ /** logs details of the situation and returns 0 if anything surprising is found in the LSNFile;
+ if something highly surprising is found, throws to abort
+ */
+ unsigned long long LSNFile::get() {
+ uassert(13614, str::stream() << "unexpected version number of lsn file in journal/ directory got: " << ver , ver == 0);
+ if( ~lsn != checkbytes ) {
+ log() << "lsnfile not valid. recovery will be from log start. lsn: " << hex << lsn << " checkbytes: " << hex << checkbytes << endl;
+ return 0;
+ }
+ return lsn;
+ }
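+
+ /* torn-write detection sketch (illustrative):
+
+ LSNFile f;
+ f.set(0x1122334455667788ULL); // stores lsn and checkbytes = ~lsn
+ assert( f.get() == 0x1122334455667788ULL );
+ f.lsn ^= 1; // simulate a corrupted/partial write
+ assert( f.get() == 0 ); // mismatch -> recover from log start
+ */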
+
+ /** called during recovery (the error message text below assumes that)
+ */
+ unsigned long long journalReadLSN() {
+ if( !MemoryMappedFile::exists(lsnPath()) ) {
+ log() << "info no lsn file in journal/ directory" << endl;
+ return 0;
+ }
+
+ try {
+ // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
+ // however, given we actually close the file when writing, that seems unlikely.
+ LSNFile L;
+ File f;
+ f.open(lsnPath().string().c_str());
+ assert(f.is_open());
+ if( f.len() == 0 ) {
+ // this could be 'normal' if we crashed at the right moment
+ log() << "info lsn file is zero bytes long" << endl;
+ return 0;
+ }
+ f.read(0,(char*)&L, sizeof(L));
+ unsigned long long lsn = L.get();
+ return lsn;
+ }
+ catch(std::exception& e) {
+ uasserted(13611, str::stream() << "can't read lsn file in journal directory : " << e.what());
+ }
+ return 0;
+ }
+
+ unsigned long long getLastDataFileFlushTime() {
+ return j.lastFlushTime();
+ }
+
+ /** remember "last sequence number" to speed recoveries
+ concurrency: called by durThread only.
+ */
+ void Journal::updateLSNFile() {
+ RACECHECK
+ if( !_writeToLSNNeeded )
+ return;
+ _writeToLSNNeeded = false;
+ try {
+ // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
+ // however, given we actually close the file, that seems unlikely.
+ File f;
+ f.open(lsnPath().string().c_str());
+ if( !f.is_open() ) {
+ // can get 0 if an i/o error
+ log() << "warning: open of lsn file failed" << endl;
+ return;
+ }
+ LOG(1) << "lsn set " << _lastFlushTime << endl;
+ LSNFile lsnf;
+ lsnf.set(_lastFlushTime);
+ f.write(0, (char*)&lsnf, sizeof(lsnf));
+ // do we want to fsync here? if we do it probably needs to be async so the durthread
+ // is not delayed.
+ }
+ catch(std::exception& e) {
+ log() << "warning: write to lsn file failed " << e.what() << endl;
+ // keep running (ignore the error). recovery will be slow.
+ }
+ }
+
+ void Journal::preFlush() {
+ j._preFlushTime = Listener::getElapsedTimeMillis();
+ }
+
+ void Journal::postFlush() {
+ j._lastFlushTime = j._preFlushTime;
+ j._writeToLSNNeeded = true;
+ }
+
+ // call from within _curLogFileMutex
+ void Journal::closeCurrentJournalFile() {
+ if (!_curLogFile)
+ return;
+
+ JFile jf;
+ jf.filename = _curLogFile->_name;
+ jf.lastEventTimeMs = Listener::getElapsedTimeMillis();
+ _oldJournalFiles.push_back(jf);
+
+ delete _curLogFile; // close
+ _curLogFile = 0;
+ _written = 0;
+ }
+
+ /** remove older journal files.
+ hold _curLogFileMutex, but not dbMutex, when calling
+ */
+ void Journal::removeUnneededJournalFiles() {
+ while( !_oldJournalFiles.empty() ) {
+ JFile f = _oldJournalFiles.front();
+
+ if( f.lastEventTimeMs < _lastFlushTime + ExtraKeepTimeMs ) {
+ // eligible for deletion
+ path p( f.filename );
+ log() << "old journal file will be removed: " << f.filename << endl;
+ removeOldJournalFile(p);
+ }
+ else {
+ break;
+ }
+
+ _oldJournalFiles.pop_front();
+ }
+ }
+
+ /*int getAgeOutJournalFiles() {
+ mutex::try_lock lk(j._curLogFileMutex, 4000);
+ if( !lk.ok )
+ return -1;
+ return j._ageOut ? 1 : 0;
+ }*/
+ void setAgeOutJournalFiles(bool a) {
+ SimpleMutex::scoped_lock lk(j._curLogFileMutex);
+ j._ageOut = a;
+ }
+
+ void Journal::_rotate() {
+ if( d.dbMutex.atLeastReadLocked() ) {
+ LOGSOME << "info journal _rotate called inside dbMutex - ok but should be somewhat rare" << endl;
+ }
+
+ RACECHECK;
+
+ _curLogFileMutex.dassertLocked();
+
+ if ( inShutdown() || !_curLogFile )
+ return;
+
+ j.updateLSNFile();
+
+ if( _curLogFile && _written < DataLimitPerJournalFile )
+ return;
+
+ if( _curLogFile ) {
+ _curLogFile->truncate();
+ closeCurrentJournalFile();
+ removeUnneededJournalFiles();
+ }
+
+ try {
+ Timer t;
+ _open();
+ int ms = t.millis();
+ if( ms >= 200 ) {
+ log() << "DR101 latency warning on journal file open " << ms << "ms" << endl;
+ }
+ }
+ catch(std::exception& e) {
+ log() << "warning exception opening journal file " << e.what() << endl;
+ throw;
+ }
+ }
+
+ /** write (append) the buffer we have built to the journal and fsync it.
+ outside of dbMutex lock as this could be slow.
+ @param uncompressed - a buffer that will be written to the journal after compression
+ will not return until on disk
+ */
+ void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed) {
+ Timer t;
+ j.journal(h, uncompressed);
+ stats.curr->_writeToJournalMicros += t.micros();
+ }
+ void Journal::journal(const JSectHeader& h, const AlignedBuilder& uncompressed) {
+ RACECHECK
+ static AlignedBuilder b(32*1024*1024);
+ /* buffer to journal will be
+ JSectHeader
+ compressed operations
+ JSectFooter
+ */
+ const unsigned headTailSize = sizeof(JSectHeader) + sizeof(JSectFooter);
+ const unsigned max = maxCompressedLength(uncompressed.len()) + headTailSize;
+ b.reset(max);
+
+ {
+ dassert( h.sectionLen() == (unsigned) 0xffffffff ); // we will backfill later
+ b.appendStruct(h);
+ }
+
+ size_t compressedLength = 0;
+ rawCompress(uncompressed.buf(), uncompressed.len(), b.cur(), &compressedLength);
+ assert( compressedLength < 0xffffffff );
+ assert( compressedLength < max );
+ b.skip(compressedLength);
+
+ // footer
+ unsigned L = 0xffffffff;
+ {
+ // pad to alignment, and set the total section length in the JSectHeader
+ assert( 0xffffe000 == (~(Alignment-1)) );
+ unsigned lenUnpadded = b.len() + sizeof(JSectFooter);
+ L = (lenUnpadded + Alignment-1) & (~(Alignment-1));
+ dassert( L >= lenUnpadded );
+
+ ((JSectHeader*)b.atOfs(0))->setSectionLen(lenUnpadded);
+
+ JSectFooter f(b.buf(), b.len()); // computes checksum
+ b.appendStruct(f);
+ dassert( b.len() == lenUnpadded );
+
+ b.skip(L - lenUnpadded);
+ dassert( b.len() % Alignment == 0 );
+ }
+
+ try {
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+
+ // must already be open -- so that _curFileId is correct for previous buffer building
+ assert( _curLogFile );
+
+ stats.curr->_uncompressedBytes += b.len();
+ unsigned w = b.len();
+ _written += w;
+ assert( w <= L );
+ stats.curr->_journaledBytes += L;
+ _curLogFile->synchronousAppend((const void *) b.buf(), L);
+ _rotate();
+ }
+ catch(std::exception& e) {
+ log() << "error exception in dur::journal " << e.what() << endl;
+ throw;
+ }
+ }
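+
+ /* padding arithmetic example (illustrative numbers): if the header plus
+ compressed payload brings b.len() to 30000 bytes, then lenUnpadded =
+ 30000 + 32 (JSectFooter) = 30032 and L = (30032 + 8191) & ~8191 = 32768,
+ i.e. four 8192-byte Alignment units; the final skip() zero-fills the
+ remaining 2736 bytes so every section ends on an Alignment boundary. */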
+
+ }
+}
+
+/* todo
+ test (and handle) disk full on journal append. best quick thing to do is to terminate.
+ if we roll back operations, there are nuances such as is ReplSetImpl::lastOpTimeWritten too new in ram then?
+*/
diff --git a/src/mongo/db/dur_journal.h b/src/mongo/db/dur_journal.h
new file mode 100644
index 00000000000..664f63942e0
--- /dev/null
+++ b/src/mongo/db/dur_journal.h
@@ -0,0 +1,68 @@
+// @file dur_journal.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+ class AlignedBuilder;
+
+ namespace dur {
+
+ /** true if ok to cleanup journal files at termination. otherwise, journal files will be retained.
+ */
+ extern bool okToCleanUp;
+
+ /** at termination after db files closed & fsynced
+ also after recovery
+ closes and removes journal files
+ @param log report in log that we are cleaning up if we actually do any work
+ */
+ void journalCleanup(bool log = false);
+
+ /** assure journal/ dir exists. throws */
+ void journalMakeDir();
+
+ /** check if time to rotate files; assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ only called by durThread.
+ */
+ void journalRotate();
+
+ /** flag that something has gone wrong during writing to the journal
+ (not for recovery mode)
+ */
+ void journalingFailure(const char *msg);
+
+ /** read lsn from disk from the last run before doing recovery */
+ unsigned long long journalReadLSN();
+
+ unsigned long long getLastDataFileFlushTime();
+
+ /** never throws.
+ @return true if there are any journal files in the journal dir.
+ */
+ bool haveJournalFiles();
+
+ // in case disk controller buffers writes
+ const long long ExtraKeepTimeMs = 10000;
+
+ const unsigned JournalCommitIntervalDefault = 100;
+
+ }
+}
diff --git a/src/mongo/db/dur_journalformat.h b/src/mongo/db/dur_journalformat.h
new file mode 100644
index 00000000000..10ed8487b71
--- /dev/null
+++ b/src/mongo/db/dur_journalformat.h
@@ -0,0 +1,174 @@
+// @file dur_journalformat.h The format of our journal files.
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ namespace dur {
+
+ const unsigned Alignment = 8192;
+
+#pragma pack(1)
+ /** beginning header for a journal/j._<n> file
+ there is nothing important in this header at this time, except perhaps the version #.
+ */
+ struct JHeader {
+ JHeader() { }
+ JHeader(string fname);
+
+ char magic[2]; // "j\n". j means journal, then a linefeed, fwiw if you were to run "less" on the file or something...
+
+ // 0x4142 is ascii-readable if you look at the file with head/less -- thus the starting values were near
+ // that. simply incrementing the version # is safe on a forward basis.
+#if defined(_NOCOMPRESS)
+ enum { CurrentVersion = 0x4148 };
+#else
+ enum { CurrentVersion = 0x4149 };
+#endif
+ unsigned short _version;
+
+ // these are just for diagnostic ease (make header more useful as plain text)
+ char n1; // '\n'
+ char ts[20]; // ascii timestamp of file generation. for user reading, not used by code.
+ char n2; // '\n'
+ char dbpath[128]; // path/filename of this file for human reading and diagnostics. not used by code.
+ char n3, n4; // '\n', '\n'
+
+ unsigned long long fileId; // unique identifier that will be in each JSectHeader. important as we recycle prealloced files
+
+ char reserved3[8026]; // 8KB total for the file header
+ char txt2[2]; // "\n\n" at the end
+
+ bool versionOk() const { return _version == CurrentVersion; }
+ bool valid() const { return magic[0] == 'j' && txt2[1] == '\n' && fileId; }
+ };
+
+ /** "Section" header. A section corresponds to a group commit.
+ len is length of the entire section including header and footer.
+ header and footer are not compressed, just the stuff in between.
+ */
+ struct JSectHeader {
+ private:
+ unsigned _sectionLen; // unpadded length in bytes of the whole section
+ public:
+ unsigned long long seqNumber; // sequence number that can be used on recovery to not do too much work
+ unsigned long long fileId; // matches JHeader::fileId
+ unsigned sectionLen() const { return _sectionLen; }
+
+ // we store the unpadded length so we can use that when we uncompress. to
+ // get the true total size this must be rounded up to the Alignment.
+ void setSectionLen(unsigned lenUnpadded) { _sectionLen = lenUnpadded; }
+
+ unsigned sectionLenWithPadding() const {
+ unsigned x = (sectionLen() + (Alignment-1)) & (~(Alignment-1));
+ dassert( x % Alignment == 0 );
+ return x;
+ }
+ };
+
+ /** an individual write operation within a group commit section. Either the entire section should
+ be applied, or nothing. (We check the md5 for the whole section before doing anything on recovery.)
+ */
+ struct JEntry {
+ enum OpCodes {
+ OpCode_Footer = 0xffffffff,
+ OpCode_DbContext = 0xfffffffe,
+ OpCode_FileCreated = 0xfffffffd,
+ OpCode_DropDb = 0xfffffffc,
+ OpCode_Min = 0xfffff000
+ };
+ union {
+ unsigned len; // length in bytes of the data of the JEntry. does not include the JEntry header
+ OpCodes opcode;
+ };
+
+ unsigned ofs; // offset in file
+
+ // sentinel and masks for _fileNo
+ enum {
+ DotNsSuffix = 0x7fffffff, // ".ns" file
+ LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext
+ };
+ int _fileNo; // high bit is set to indicate it should be the <dbpath>/local database
+ // char data[len] follows
+
+ const char * srcData() const {
+ const int *i = &_fileNo;
+ return (const char *) (i+1);
+ }
+
+ int getFileNo() const { return _fileNo & (~LocalDbBit); }
+ void setFileNo(int f) { _fileNo = f; }
+ bool isNsSuffix() const { return getFileNo() == DotNsSuffix; }
+
+ void setLocalDbContextBit() { _fileNo |= LocalDbBit; }
+ bool isLocalDbContext() const { return _fileNo & LocalDbBit; }
+ void clearLocalDbContextBit() { _fileNo = getFileNo(); }
+
+ static string suffix(int fileno) {
+ if( fileno == DotNsSuffix ) return "ns";
+ stringstream ss;
+ ss << fileno;
+ return ss.str();
+ }
+ };
+
+ /** group commit section footer. md5 is a key field. */
+ struct JSectFooter {
+ JSectFooter();
+ JSectFooter(const void* begin, int len); // needs buffer to compute hash
+ unsigned sentinel;
+ unsigned char hash[16];
+ unsigned long long reserved;
+ char magic[4]; // "\n\n\n\n"
+
+ /** used by recovery to see if buffer is valid
+ @param begin the buffer
+ @param len buffer len
+ @return true if buffer looks valid
+ */
+ bool checkHash(const void* begin, int len) const;
+
+ bool magicOk() const { return *((unsigned*)magic) == 0x0a0a0a0a; }
+ };
+
+ /** declares "the next entry(s) are for this database / file path prefix" */
+ struct JDbContext {
+ JDbContext() : sentinel(JEntry::OpCode_DbContext) { }
+            const unsigned sentinel; // occupies the JEntry::len slot -- OpCode_DbContext is our sentinel
+ //char dbname[];
+ };
+
+ /** "last sequence number" */
+ struct LSNFile {
+ unsigned ver;
+ unsigned reserved2;
+ unsigned long long lsn;
+ unsigned long long checkbytes;
+ unsigned long long reserved[8];
+
+ void set(unsigned long long lsn);
+ unsigned long long get();
+ };
+
+#pragma pack()
+
+ }
+
+}
diff --git a/src/mongo/db/dur_journalimpl.h b/src/mongo/db/dur_journalimpl.h
new file mode 100644
index 00000000000..8aad70b0e5c
--- /dev/null
+++ b/src/mongo/db/dur_journalimpl.h
@@ -0,0 +1,103 @@
+// @file dur_journalimpl.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "dur_journalformat.h"
+#include "../util/logfile.h"
+
+namespace mongo {
+ namespace dur {
+
+ /** the writeahead journal for durability */
+ class Journal {
+ public:
+ string dir; // set by journalMakeDir() during initialization
+
+ Journal();
+
+ /** call during startup by journalMakeDir() */
+ void init();
+
+ /** check if time to rotate files. assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ thread: durThread()
+ */
+ void rotate();
+
+ /** append to the journal file
+ */
+ void journal(const JSectHeader& h, const AlignedBuilder& b);
+
+ boost::filesystem::path getFilePathFor(int filenumber) const;
+
+ unsigned long long lastFlushTime() const { return _lastFlushTime; }
+ void cleanup(bool log); // closes and removes journal files
+
+ unsigned long long curFileId() const { return _curFileId; }
+
+ void assureLogFileOpen() {
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+ if( _curLogFile == 0 )
+ _open();
+ }
+
+ /** open a journal file to journal operations to. */
+ void open();
+
+ private:
+ /** check if time to rotate files. assure a file is open.
+ * internally called with every commit
+ */
+ void _rotate();
+
+ void _open();
+ void closeCurrentJournalFile();
+ void removeUnneededJournalFiles();
+
+ unsigned long long _written; // bytes written so far to the current journal (log) file
+ unsigned _nextFileNumber;
+ public:
+ SimpleMutex _curLogFileMutex;
+ bool _ageOut;
+ private:
+
+ LogFile *_curLogFile; // use _curLogFileMutex
+ unsigned long long _curFileId; // current file id see JHeader::fileId
+
+ struct JFile {
+ string filename;
+ unsigned long long lastEventTimeMs;
+ };
+
+ // files which have been closed but not unlinked (rotated out) yet
+ // ordered oldest to newest
+ list<JFile> _oldJournalFiles; // use _curLogFileMutex
+
+ // lsn related
+ static void preFlush();
+ static void postFlush();
+ unsigned long long _preFlushTime;
+ unsigned long long _lastFlushTime; // data < this time is fsynced in the datafiles (unless hard drive controller is caching)
+ bool _writeToLSNNeeded;
+ void updateLSNFile();
+ };
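+        // Illustrative note (not in the original source): the intended call sequence, pieced
+        // together from the comments above -- a sketch, not normative:
+        //   Journal j;
+        //   j.init();               // called via journalMakeDir() at startup
+        //   j.rotate();             // durThread(): rotate/open outside the group commit lock
+        //   j.journal(h, b);        // append one group-commit section (JSectHeader + buffer)
+        //   j.cleanup(true);        // shutdown: close and remove journal files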
+
+ }
+}
diff --git a/src/mongo/db/dur_preplogbuffer.cpp b/src/mongo/db/dur_preplogbuffer.cpp
new file mode 100644
index 00000000000..10b63c0e549
--- /dev/null
+++ b/src/mongo/db/dur_preplogbuffer.cpp
@@ -0,0 +1,177 @@
+// @file dur_preplogbuffer.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ PREPLOGBUFFER
+  we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_journalimpl.h"
+#include "dur_commitjob.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "../util/alignedbuilder.h"
+#include "../util/timer.h"
+#include "dur_stats.h"
+#include "../server.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+ namespace dur {
+
+ extern Journal j;
+
+ RelativePath local = RelativePath::fromRelativePath("local");
+
+ static MongoMMF* findMMF_inlock(void *ptr, size_t &ofs) {
+ MongoMMF *f = privateViews.find_inlock(ptr, ofs);
+ if( f == 0 ) {
+ error() << "findMMF_inlock failed " << privateViews.numberOfViews_inlock() << endl;
+ printStackTrace(); // we want a stack trace and the assert below didn't print a trace once in the real world - not sure why
+ stringstream ss;
+ ss << "view pointer cannot be resolved " << hex << (size_t) ptr;
+ journalingFailure(ss.str().c_str()); // asserts, which then abends
+ }
+ return f;
+ }
+
+ /** put the basic write operation into the buffer (bb) to be journaled */
+ static void prepBasicWrite_inlock(AlignedBuilder&bb, const WriteIntent *i, RelativePath& lastDbPath) {
+ size_t ofs = 1;
+ MongoMMF *mmf = findMMF_inlock(i->start(), /*out*/ofs);
+
+ if( unlikely(!mmf->willNeedRemap()) ) {
+                // tag this mmf as needing a remap of its private view later.
+ // usually it will already be dirty/already set, so we do the if above first
+ // to avoid possibility of cpu cache line contention
+ mmf->willNeedRemap() = true;
+ }
+
+ // since we have already looked up the mmf, we go ahead and remember the write view location
+ // so we don't have to find the MongoMMF again later in WRITETODATAFILES()
+ //
+ // this was for WRITETODATAFILES_Impl2 so commented out now
+ //
+ /*
+ dassert( i->w_ptr == 0 );
+ i->w_ptr = ((char*)mmf->view_write()) + ofs;
+ */
+
+ JEntry e;
+            e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); // don't write past the end of the file
+ assert( ofs <= 0x80000000 );
+ e.ofs = (unsigned) ofs;
+ e.setFileNo( mmf->fileSuffixNo() );
+ if( mmf->relativePath() == local ) {
+ e.setLocalDbContextBit();
+ }
+ else if( mmf->relativePath() != lastDbPath ) {
+ lastDbPath = mmf->relativePath();
+ JDbContext c;
+ bb.appendStruct(c);
+ bb.appendStr(lastDbPath.toString());
+ }
+ bb.appendStruct(e);
+#if defined(_EXPERIMENTAL)
+ i->ofsInJournalBuffer = bb.len();
+#endif
+ bb.appendBuf(i->start(), e.len);
+
+ if (unlikely(e.len != (unsigned)i->length())) {
+ log() << "journal info splitting prepBasicWrite at boundary" << endl;
+
+ // This only happens if we write to the last byte in a file and
+                // the first byte in another file that is mapped adjacently. I
+ // think most OSs leave at least a one page gap between
+ // mappings, but better to be safe.
+
+ WriteIntent next ((char*)i->start() + e.len, i->length() - e.len);
+ prepBasicWrite_inlock(bb, &next, lastDbPath);
+ }
+ }
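+        // Illustrative note (not in the original source): a concrete (hypothetical) instance of
+        // the split above. If <db>.3 is 0x1000 bytes long and an intent covers [0xff0, 0x1010)
+        // of it, the first JEntry is clipped to len 0x10, and the recursive call emits a second
+        // JEntry for the remaining 0x10 bytes at the start of the adjacently mapped file.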
+
+        /** basic write ops / write intents. note there is no particular order to these: if we have
+            two writes to the same location during the group commit interval, it is likely
+            (although not assured) that it is journaled here only once.
+ */
+ static void prepBasicWrites(AlignedBuilder& bb) {
+ scoped_lock lk(privateViews._mutex());
+
+ // each time events switch to a different database we journal a JDbContext
+ RelativePath lastDbPath;
+
+ for( set<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) {
+ prepBasicWrite_inlock(bb, &(*i), lastDbPath);
+ }
+ }
+
+ static void resetLogBuffer(/*out*/JSectHeader& h, AlignedBuilder& bb) {
+ bb.reset();
+
+ h.setSectionLen(0xffffffff); // total length, will fill in later
+ h.seqNumber = getLastDataFileFlushTime();
+ h.fileId = j.curFileId();
+ }
+
+        /** we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ caller handles locking
+ @return partially populated sectheader and _ab set
+ */
+ static void _PREPLOGBUFFER(JSectHeader& h) {
+ assert( cmdLine.dur );
+
+ {
+ // now that we are locked, fully drain deferred notes of write intents
+ DEV d.dbMutex.assertAtLeastReadLocked();
+ Writes& writes = commitJob.wi();
+ writes._deferred.invoke();
+ writes._drained = true;
+ }
+
+ AlignedBuilder& bb = commitJob._ab;
+ resetLogBuffer(h, bb); // adds JSectHeader
+
+ // ops other than basic writes (DurOp's)
+ {
+ for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) {
+ (*i)->serialize(bb);
+ }
+ }
+
+ prepBasicWrites(bb);
+
+ return;
+ }
+ void PREPLOGBUFFER(/*out*/ JSectHeader& h) {
+ Timer t;
+ j.assureLogFileOpen(); // so fileId is set
+ _PREPLOGBUFFER(h);
+ stats.curr->_prepLogBufferMicros += t.micros();
+ }
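+        // Illustrative note (not in the original source): the buffer built above holds one
+        // group-commit section, which per dur_journalformat.h is laid out roughly as:
+        //   JSectHeader                          (len patched later; seqNumber; fileId)
+        //   serialized DurOps                    (file creates, db drops, ...)
+        //   [JDbContext "<db>"] JEntry + bytes   (repeated; context re-emitted on db change)
+        //   JSectFooter                          (md5; appended downstream of this function)
+        // Only the part between header and footer gets compressed.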
+
+ }
+}
diff --git a/src/mongo/db/dur_recover.cpp b/src/mongo/db/dur_recover.cpp
new file mode 100644
index 00000000000..a0a8843572c
--- /dev/null
+++ b/src/mongo/db/dur_recover.cpp
@@ -0,0 +1,544 @@
+// @file dur_recover.cpp crash recovery via the journal
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "dur.h"
+#include "dur_stats.h"
+#include "dur_recover.h"
+#include "dur_journal.h"
+#include "dur_journalformat.h"
+#include "durop.h"
+#include "namespace.h"
+#include "../util/mongoutils/str.h"
+#include "../util/bufreader.h"
+#include "../util/concurrency/race.h"
+#include "pdfile.h"
+#include "database.h"
+#include "db.h"
+#include "../util/unittest.h"
+#include "../util/checksum.h"
+#include "cmdline.h"
+#include "curop.h"
+#include "mongommf.h"
+#include "../util/compress.h"
+
+#include <sys/stat.h>
+#include <fcntl.h>
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ namespace dur {
+
+ struct ParsedJournalEntry { /*copyable*/
+ ParsedJournalEntry() : e(0) { }
+
+ // relative path of database for the operation.
+            // might be a pointer into the mmapped journal file
+ const char *dbName;
+
+            // these are pointers into the memory mapped journal file
+ const JEntry *e; // local db sentinel is already parsed out here into dbName
+
+ // if not one of the two simple JEntry's above, this is the operation:
+ shared_ptr<DurOp> op;
+ };
+
+ void removeJournalFiles();
+ path getJournalDir();
+
+ /** get journal filenames, in order. throws if unexpected content found */
+ static void getFiles(path dir, vector<path>& files) {
+ map<unsigned,path> m;
+ for ( boost::filesystem::directory_iterator i( dir );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ boost::filesystem::path filepath = *i;
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") ) {
+ unsigned u = str::toUnsigned( str::after(fileName, '_') );
+ if( m.count(u) ) {
+ uasserted(13531, str::stream() << "unexpected files in journal directory " << dir.string() << " : " << fileName);
+ }
+ m.insert( pair<unsigned,path>(u,filepath) );
+ }
+ }
+ for( map<unsigned,path>::iterator i = m.begin(); i != m.end(); ++i ) {
+ if( i != m.begin() && m.count(i->first - 1) == 0 ) {
+ uasserted(13532,
+ str::stream() << "unexpected file in journal directory " << dir.string()
+                              << " : " << boost::filesystem::path(i->second).leaf() << " : can't find its preceding file");
+ }
+ files.push_back(i->second);
+ }
+ }
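+        // Illustrative note (not in the original source): journal files are named j._<n>, so a
+        // directory holding j._2, j._0, j._1 comes back ordered j._0, j._1, j._2. A gap (j._0,
+        // j._2 with no j._1) trips 13532; two names parsing to the same <n> trip 13531; files
+        // without the j._ prefix are simply ignored.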
+
+ /** read through the memory mapped data of a journal file (journal/j._<n> file)
+ throws
+ */
+ class JournalSectionIterator : boost::noncopyable {
+ auto_ptr<BufReader> _entries;
+ const JSectHeader _h;
+            const char *_lastDbName; // pointer into the mmapped journal file
+ const bool _doDurOps;
+ string _uncompressed;
+ public:
+ JournalSectionIterator(const JSectHeader& h, const void *compressed, unsigned compressedLen, bool doDurOpsRecovering) :
+ _h(h),
+ _lastDbName(0)
+ , _doDurOps(doDurOpsRecovering)
+ {
+ assert( doDurOpsRecovering );
+ bool ok = uncompress((const char *)compressed, compressedLen, &_uncompressed);
+ if( !ok ) {
+ // it should always be ok (i think?) as there is a previous check to see that the JSectFooter is ok
+ log() << "couldn't uncompress journal section" << endl;
+ msgasserted(15874, "couldn't uncompress journal section");
+ }
+ const char *p = _uncompressed.c_str();
+ assert( compressedLen == _h.sectionLen() - sizeof(JSectFooter) - sizeof(JSectHeader) );
+ _entries = auto_ptr<BufReader>( new BufReader(p, _uncompressed.size()) );
+ }
+
+ // we work with the uncompressed buffer when doing a WRITETODATAFILES (for speed)
+ JournalSectionIterator(const JSectHeader &h, const void *p, unsigned len) :
+ _entries( new BufReader((const char *) p, len) ),
+ _h(h),
+ _lastDbName(0)
+ , _doDurOps(false)
+
+ { }
+
+ bool atEof() const { return _entries->atEof(); }
+
+ unsigned long long seqNumber() const { return _h.seqNumber; }
+
+ /** get the next entry from the log. this function parses and combines JDbContext and JEntry's.
+ * throws on premature end of section.
+ */
+ void next(ParsedJournalEntry& e) {
+ unsigned lenOrOpCode;
+ _entries->read(lenOrOpCode);
+
+ if (lenOrOpCode > JEntry::OpCode_Min) {
+ switch( lenOrOpCode ) {
+
+ case JEntry::OpCode_Footer: {
+ assert( false );
+ }
+
+ case JEntry::OpCode_FileCreated:
+ case JEntry::OpCode_DropDb: {
+ e.dbName = 0;
+ boost::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, *_entries);
+ if (_doDurOps) {
+ e.op = op;
+ }
+ return;
+ }
+
+ case JEntry::OpCode_DbContext: {
+ _lastDbName = (const char*) _entries->pos();
+ const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _entries->remaining());
+ const unsigned len = strnlen(_lastDbName, limit);
+ massert(13533, "problem processing journal file during recovery", _lastDbName[len] == '\0');
+ _entries->skip(len+1); // skip '\0' too
+ _entries->read(lenOrOpCode); // read this for the fall through
+ }
+ // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet
+
+ default:
+ // fall through
+ ;
+ }
+ }
+
+ // JEntry - a basic write
+ assert( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min );
+ _entries->rewind(4);
+ e.e = (JEntry *) _entries->skip(sizeof(JEntry));
+ e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName;
+ assert( e.e->len == lenOrOpCode );
+ _entries->skip(e.e->len);
+ }
+
+ };
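+        // Illustrative note (not in the original source): next() dispatches on the first u32 of
+        // each record. Values above OpCode_Min are opcodes (footer, db context, DurOps); anything
+        // smaller is the len field of a JEntry, so the reader rewinds 4 bytes and overlays the
+        // JEntry struct directly on the (uncompressed) buffer.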
+
+ static string fileName(const char* dbName, int fileNo) {
+ stringstream ss;
+ ss << dbName << '.';
+ assert( fileNo >= 0 );
+ if( fileNo == JEntry::DotNsSuffix )
+ ss << "ns";
+ else
+ ss << fileNo;
+
+ // relative name -> full path name
+ path full(dbpath);
+ full /= ss.str();
+ return full.string();
+ }
+
+ RecoveryJob::~RecoveryJob() {
+ DESTRUCTOR_GUARD(
+ if( !_mmfs.empty() )
+ close();
+ )
+ }
+
+ void RecoveryJob::close() {
+ scoped_lock lk(_mx);
+ _close();
+ }
+
+ void RecoveryJob::_close() {
+ MongoFile::flushAll(true);
+ _mmfs.clear();
+ }
+
+ void RecoveryJob::write(const ParsedJournalEntry& entry) {
+ //TODO(mathias): look into making some of these dasserts
+ assert(entry.e);
+ assert(entry.dbName);
+ assert(strnlen(entry.dbName, MaxDatabaseNameLen) < MaxDatabaseNameLen);
+
+ const string fn = fileName(entry.dbName, entry.e->getFileNo());
+ MongoFile* file;
+ {
+ MongoFileFinder finder; // must release lock before creating new MongoMMF
+ file = finder.findByPath(fn);
+ }
+
+ MongoMMF* mmf;
+ if (file) {
+ assert(file->isMongoMMF());
+ mmf = (MongoMMF*)file;
+ }
+ else {
+ if( !_recovering ) {
+ log() << "journal error applying writes, file " << fn << " is not open" << endl;
+ assert(false);
+ }
+ boost::shared_ptr<MongoMMF> sp (new MongoMMF);
+ assert(sp->open(fn, false));
+ _mmfs.push_back(sp);
+ mmf = sp.get();
+ }
+
+ if ((entry.e->ofs + entry.e->len) <= mmf->length()) {
+ assert(mmf->view_write());
+ assert(entry.e->srcData());
+
+ void* dest = (char*)mmf->view_write() + entry.e->ofs;
+ memcpy(dest, entry.e->srcData(), entry.e->len);
+ stats.curr->_writeToDataFilesBytes += entry.e->len;
+ }
+ else {
+ massert(13622, "Trying to write past end of file in WRITETODATAFILES", _recovering);
+ }
+ }
+
+ void RecoveryJob::applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump) {
+ if( entry.e ) {
+ if( dump ) {
+ stringstream ss;
+ ss << " BASICWRITE " << setw(20) << entry.dbName << '.';
+ if( entry.e->isNsSuffix() )
+ ss << "ns";
+ else
+ ss << setw(2) << entry.e->getFileNo();
+ ss << ' ' << setw(6) << entry.e->len << ' ' << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/
+ " " << hexdump(entry.e->srcData(), entry.e->len);
+ log() << ss.str() << endl;
+ }
+ if( apply ) {
+ write(entry);
+ }
+ }
+ else if(entry.op) {
+ // a DurOp subclass operation
+ if( dump ) {
+ log() << " OP " << entry.op->toString() << endl;
+ }
+ if( apply ) {
+ if( entry.op->needFilesClosed() ) {
+ _close(); // locked in processSection
+ }
+ entry.op->replay();
+ }
+ }
+ }
+
+ void RecoveryJob::applyEntries(const vector<ParsedJournalEntry> &entries) {
+ bool apply = (cmdLine.durOptions & CmdLine::DurScanOnly) == 0;
+ bool dump = cmdLine.durOptions & CmdLine::DurDumpJournal;
+ if( dump )
+ log() << "BEGIN section" << endl;
+
+ for( vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i ) {
+ applyEntry(*i, apply, dump);
+ }
+
+ if( dump )
+ log() << "END section" << endl;
+ }
+
+ void RecoveryJob::processSection(const JSectHeader *h, const void *p, unsigned len, const JSectFooter *f) {
+ scoped_lock lk(_mx);
+ RACECHECK
+
+            /** todo: we should really verify the checksum to see that seqNumber is ok,
+                but that is expensive. maybe there could be a checksum of just the header,
+                within the header itself.
+            */
+ if( _recovering && _lastDataSyncedFromLastRun > h->seqNumber + ExtraKeepTimeMs ) {
+ if( h->seqNumber != _lastSeqMentionedInConsoleLog ) {
+ static int n;
+ if( ++n < 10 ) {
+ log() << "recover skipping application of section seq:" << h->seqNumber << " < lsn:" << _lastDataSyncedFromLastRun << endl;
+ }
+ else if( n == 10 ) {
+ log() << "recover skipping application of section more..." << endl;
+ }
+ _lastSeqMentionedInConsoleLog = h->seqNumber;
+ }
+ return;
+ }
+
+ auto_ptr<JournalSectionIterator> i;
+ if( _recovering ) {
+ i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, p, len, _recovering));
+ }
+ else {
+ i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, /*after header*/p, /*w/out header*/len));
+ }
+
+ // we use a static so that we don't have to reallocate every time through. occasionally we
+ // go back to a small allocation so that if there were a spiky growth it won't stick forever.
+ static vector<ParsedJournalEntry> entries;
+ entries.clear();
+/** TEMP uncomment
+ RARELY OCCASIONALLY {
+ if( entries.capacity() > 2048 ) {
+ entries.shrink_to_fit();
+ entries.reserve(2048);
+ }
+ }
+*/
+
+ // first read all entries to make sure this section is valid
+ ParsedJournalEntry e;
+ while( !i->atEof() ) {
+ i->next(e);
+ entries.push_back(e);
+ }
+
+ // after the entries check the footer checksum
+ if( _recovering ) {
+ assert( ((const char *)h) + sizeof(JSectHeader) == p );
+ if( !f->checkHash(h, len + sizeof(JSectHeader)) ) {
+ msgasserted(13594, "journal checksum doesn't match");
+ }
+ }
+
+ // got all the entries for one group commit. apply them:
+ applyEntries(entries);
+ }
+
+ /** apply a specific journal file, that is already mmap'd
+ @param p start of the memory mapped file
+ @return true if this is detected to be the last file (ends abruptly)
+ */
+ bool RecoveryJob::processFileBuffer(const void *p, unsigned len) {
+ try {
+ unsigned long long fileId;
+ BufReader br(p,len);
+
+ {
+ // read file header
+ JHeader h;
+ br.read(h);
+
+ /* [dm] not automatically handled. we should eventually handle this automatically. i think:
+ (1) if this is the final journal file
+ (2) and the file size is just the file header in length (or less) -- this is a bit tricky to determine if prealloced
+ then can just assume recovery ended cleanly and not error out (still should log).
+ */
+ uassert(13537,
+ "journal file header invalid. This could indicate corruption in a journal file, or perhaps a crash where sectors in file header were in flight written out of order at time of crash (unlikely but possible).",
+ h.valid());
+
+ if( !h.versionOk() ) {
+ log() << "journal file version number mismatch got:" << hex << h._version
+ << " expected:" << hex << (unsigned) JHeader::CurrentVersion
+ << ". if you have just upgraded, recover with old version of mongod, terminate cleanly, then upgrade."
+ << endl;
+ uasserted(13536, str::stream() << "journal version number mismatch " << h._version);
+ }
+ fileId = h.fileId;
+ if(cmdLine.durOptions & CmdLine::DurDumpJournal) {
+ log() << "JHeader::fileId=" << fileId << endl;
+ }
+ }
+
+ // read sections
+ while ( !br.atEof() ) {
+ JSectHeader h;
+ br.peek(h);
+ if( h.fileId != fileId ) {
+ if( debug || (cmdLine.durOptions & CmdLine::DurDumpJournal) ) {
+ log() << "Ending processFileBuffer at differing fileId want:" << fileId << " got:" << h.fileId << endl;
+ log() << " sect len:" << h.sectionLen() << " seqnum:" << h.seqNumber << endl;
+ }
+ return true;
+ }
+ unsigned slen = h.sectionLen();
+ unsigned dataLen = slen - sizeof(JSectHeader) - sizeof(JSectFooter);
+ const char *hdr = (const char *) br.skip(h.sectionLenWithPadding());
+ const char *data = hdr + sizeof(JSectHeader);
+ const char *footer = data + dataLen;
+ processSection((const JSectHeader*) hdr, data, dataLen, (const JSectFooter*) footer);
+
+ // ctrl c check
+ killCurrentOp.checkForInterrupt(false);
+ }
+ }
+ catch( BufReader::eof& ) {
+ if( cmdLine.durOptions & CmdLine::DurDumpJournal )
+ log() << "ABRUPT END" << endl;
+ return true; // abrupt end
+ }
+
+ return false; // non-abrupt end
+ }
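+        // Illustrative note (not in the original source): the loop above implies the layout of
+        // one journal file:
+        //   JHeader                          (8KB; magic, version, fileId)
+        //   JSectHeader ... JSectFooter      (section 0, padded out to Alignment)
+        //   JSectHeader ... JSectFooter      (section 1)
+        //   ...
+        // A section whose fileId differs from the JHeader's (a recycled prealloc file) or an
+        // abrupt EOF both mark the logical end of the file.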
+
+ /** apply a specific journal file */
+ bool RecoveryJob::processFile(path journalfile) {
+ log() << "recover " << journalfile.string() << endl;
+
+ try {
+ if( boost::filesystem::file_size( journalfile.string() ) == 0 ) {
+ log() << "recover info " << journalfile.string() << " has zero length" << endl;
+ return true;
+ }
+ } catch(...) {
+            // if something weird happened (like a permissions problem), keep going so the massert below can fire (presumably)
+ log() << "recover exception checking filesize" << endl;
+ }
+
+ MemoryMappedFile f;
+ void *p = f.mapWithOptions(journalfile.string().c_str(), MongoFile::READONLY | MongoFile::SEQUENTIAL);
+ massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p);
+ return processFileBuffer(p, (unsigned) f.length());
+ }
+
+ /** @param files all the j._0 style files we need to apply for recovery */
+ void RecoveryJob::go(vector<path>& files) {
+ log() << "recover begin" << endl;
+ _recovering = true;
+
+ // load the last sequence number synced to the datafiles on disk before the last crash
+ _lastDataSyncedFromLastRun = journalReadLSN();
+ log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl;
+
+ for( unsigned i = 0; i != files.size(); ++i ) {
+ bool abruptEnd = processFile(files[i]);
+ if( abruptEnd && i+1 < files.size() ) {
+ log() << "recover error: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl;
+ close();
+ uasserted(13535, "recover abrupt journal file end");
+ }
+ }
+
+ close();
+
+ if( cmdLine.durOptions & CmdLine::DurScanOnly ) {
+ uasserted(13545, str::stream() << "--durOptions " << (int) CmdLine::DurScanOnly << " (scan only) specified");
+ }
+
+ log() << "recover cleaning up" << endl;
+ removeJournalFiles();
+ log() << "recover done" << endl;
+ okToCleanUp = true;
+ _recovering = false;
+ }
+
+ void _recover() {
+ assert( cmdLine.dur );
+
+ boost::filesystem::path p = getJournalDir();
+ if( !exists(p) ) {
+ log() << "directory " << p.string() << " does not exist, there will be no recovery startup step" << endl;
+ okToCleanUp = true;
+ return;
+ }
+
+ vector<path> journalFiles;
+ getFiles(p, journalFiles);
+
+ if( journalFiles.empty() ) {
+ log() << "recover : no journal files present, no recovery needed" << endl;
+ okToCleanUp = true;
+ return;
+ }
+
+ RecoveryJob::get().go(journalFiles);
+ }
+
+ extern mutex groupCommitMutex;
+
+ /** recover from a crash
+ called during startup
+ throws on error
+ */
+ void recover() {
+ // we use a lock so that exitCleanly will wait for us
+ // to finish (or at least to notice what is up and stop)
+ writelock lk;
+
+ // this is so the mutexdebugger doesn't get confused. we are actually single threaded
+ // at this point in the program so it wouldn't have been a true problem (I think)
+ scoped_lock lk2(groupCommitMutex);
+
+ _recover(); // throws on interruption
+ }
+
+ struct BufReaderY { int a,b; };
+ class BufReaderUnitTest : public UnitTest {
+ public:
+ void run() {
+ BufReader r((void*) "abcdabcdabcd", 12);
+ char x;
+ BufReaderY y;
+ r.read(x); //cout << x; // a
+ assert( x == 'a' );
+ r.read(y);
+ r.read(x);
+ assert( x == 'b' );
+ }
+ } brunittest;
+
+ // can't free at termination because order of destruction of global vars is arbitrary
+ RecoveryJob &RecoveryJob::_instance = *(new RecoveryJob());
+
+ } // namespace dur
+
+} // namespace mongo
+
diff --git a/src/mongo/db/dur_recover.h b/src/mongo/db/dur_recover.h
new file mode 100644
index 00000000000..955e730ea05
--- /dev/null
+++ b/src/mongo/db/dur_recover.h
@@ -0,0 +1,50 @@
+// @file dur_recover.h crash recovery via the journal
+
+#pragma once
+
+#include "dur_journalformat.h"
+#include "../util/concurrency/mutex.h"
+#include "../util/file.h"
+
+namespace mongo {
+ class MongoMMF;
+
+ namespace dur {
+ struct ParsedJournalEntry;
+
+ /** call go() to execute a recovery from existing journal files.
+ */
+ class RecoveryJob : boost::noncopyable {
+ public:
+ RecoveryJob() : _lastDataSyncedFromLastRun(0),
+ _mx("recovery"), _recovering(false) { _lastSeqMentionedInConsoleLog = 1; }
+ void go(vector<path>& files);
+ ~RecoveryJob();
+
+ /** @param data data between header and footer. compressed if recovering. */
+ void processSection(const JSectHeader *h, const void *data, unsigned len, const JSectFooter *f);
+
+ void close(); // locks and calls _close()
+
+ static RecoveryJob & get() { return _instance; }
+ private:
+ void write(const ParsedJournalEntry& entry); // actually writes to the file
+ void applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump);
+ void applyEntries(const vector<ParsedJournalEntry> &entries);
+ bool processFileBuffer(const void *, unsigned len);
+ bool processFile(path journalfile);
+ void _close(); // doesn't lock
+
+ list<boost::shared_ptr<MongoMMF> > _mmfs;
+
+ unsigned long long _lastDataSyncedFromLastRun;
+ unsigned long long _lastSeqMentionedInConsoleLog;
+ public:
+ mongo::mutex _mx; // protects _mmfs; see setNoJournal() too
+ private:
+ bool _recovering; // are we in recovery or WRITETODATAFILES
+
+ static RecoveryJob &_instance;
+ };
+ }
+}
diff --git a/src/mongo/db/dur_stats.h b/src/mongo/db/dur_stats.h
new file mode 100644
index 00000000000..50a26d1f215
--- /dev/null
+++ b/src/mongo/db/dur_stats.h
@@ -0,0 +1,49 @@
+// @file dur_stats.h
+
+namespace mongo {
+ namespace dur {
+
+ /** journaling stats. the model here is that the commit thread is the only writer, and that reads are
+ uncommon (from a serverStatus command and such). Thus, there should not be multicore chatter overhead.
+ */
+ struct Stats {
+ Stats();
+ void rotate();
+ BSONObj asObj();
+ unsigned _intervalMicros;
+ struct S {
+ BSONObj _asObj();
+ string _asCSV();
+ string _CSVHeader();
+ void reset();
+
+ unsigned _commits;
+ unsigned _earlyCommits; // count of early commits from commitIfNeeded() or from getDur().commitNow()
+ unsigned long long _journaledBytes;
+ unsigned long long _uncompressedBytes;
+ unsigned long long _writeToDataFilesBytes;
+
+ unsigned long long _prepLogBufferMicros;
+ unsigned long long _writeToJournalMicros;
+ unsigned long long _writeToDataFilesMicros;
+ unsigned long long _remapPrivateViewMicros;
+
+            // it is undesirable to be in the write lock for the group commit (it can be done in a read lock), so it
+            // is good to have visibility when this happens. it can happen for a couple of reasons:
+ // - read lock starvation
+ // - file being closed
+ // - data being written faster than the normal group commit interval
+ unsigned _commitsInWriteLock;
+
+ unsigned _dtMillis;
+ };
+ S *curr;
+ private:
+ S _a,_b;
+ unsigned long long _lastRotate;
+ S* other();
+ };
+ extern Stats stats;
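+        // Illustrative note (not in the original source): _a and _b above appear to
+        // double-buffer the counters -- curr points at the interval being accumulated while
+        // the other S holds the last completed interval for readers; rotate() presumably swaps
+        // them and reset()s the newly current one.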
+
+ }
+}
diff --git a/src/mongo/db/dur_writetodatafiles.cpp b/src/mongo/db/dur_writetodatafiles.cpp
new file mode 100644
index 00000000000..d77b0482c20
--- /dev/null
+++ b/src/mongo/db/dur_writetodatafiles.cpp
@@ -0,0 +1,94 @@
+// @file dur_writetodatafiles.cpp apply the writes back to the non-private MMF after they are for certain in the redo log
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dur_commitjob.h"
+#include "dur_stats.h"
+#include "dur_recover.h"
+#include "../util/timer.h"
+
+namespace mongo {
+ namespace dur {
+
+ void debugValidateAllMapsMatch();
+
+ static void WRITETODATAFILES_Impl1(const JSectHeader& h, AlignedBuilder& uncompressed) {
+ LockMongoFilesShared lk;
+ LOG(3) << "journal WRITETODATAFILES 1" << endl;
+ RecoveryJob::get().processSection(&h, uncompressed.buf(), uncompressed.len(), 0);
+ LOG(3) << "journal WRITETODATAFILES 2" << endl;
+ }
+
+#if 0
+ // the old implementation. doesn't work with groupCommitWithLimitedLocks()
+ void WRITETODATAFILES_Impl2() {
+ /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
+ for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
+ const WriteIntent& intent = *it;
+ stats.curr->_writeToDataFilesBytes += intent.length();
+ dassert(intent.w_ptr);
+ memcpy(intent.w_ptr, intent.start(), intent.length());
+ }
+ }
+#endif
+
+#if defined(_EXPERIMENTAL)
+ // doesn't work with groupCommitWithLimitedLocks()
+ void WRITETODATAFILES_Impl3() {
+ /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
+ for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
+ const WriteIntent& intent = *it;
+ stats.curr->_writeToDataFilesBytes += intent.length();
+ dassert(intent.w_ptr);
+ memcpy(intent.w_ptr,
+ commitJob._ab.atOfs(intent.ofsInJournalBuffer),
+ intent.length());
+ }
+ }
+#endif
+
+        /** apply the writes back to the non-private MMF after they are for certain in the redo log
+
+       (1) todo we don't need to write back everything every group commit. we MUST write back
+       that which is going to be remapped on its private view - but that might not be all
+       views.
+
+ (2) todo should we do this using N threads? would be quite easy
+ see Hackenberg paper table 5 and 6. 2 threads might be a good balance.
+
+ (3) with enough work, we could do this outside the read lock. it's a bit tricky though.
+ - we couldn't do it from the private views then as they may be changing. would have to then
+ be from the journal alignedbuffer.
+ - we need to be careful the file isn't unmapped on us -- perhaps a mutex or something
+ with MongoMMF on closes or something to coordinate that.
+
+ concurrency: in mmmutex, not necessarily in dbMutex
+
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc&hl=en
+ */
+
+ void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed) {
+ Timer t;
+ WRITETODATAFILES_Impl1(h, uncompressed);
+ unsigned long long m = t.micros();
+ stats.curr->_writeToDataFilesMicros += m;
+ LOG(2) << "journal WRITETODATAFILES " << m / 1000.0 << "ms" << endl;
+ }
+
+ }
+}
diff --git a/src/mongo/db/durop.cpp b/src/mongo/db/durop.cpp
new file mode 100644
index 00000000000..80ee5043410
--- /dev/null
+++ b/src/mongo/db/durop.cpp
@@ -0,0 +1,161 @@
+// @file durop.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "concurrency.h"
+#include "../util/alignedbuilder.h"
+#include "../util/mongoutils/str.h"
+#include "../util/file.h"
+#include "mongommf.h"
+#include "durop.h"
+#include "../util/file_allocator.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ extern string dbpath; // --dbpath parm
+
+ void _deleteDataFiles(const char *);
+
+ namespace dur {
+
+ /** read a durop from journal file referenced by br.
+ @param opcode the opcode which has already been written from the bufreader
+ */
+ shared_ptr<DurOp> DurOp::read(unsigned opcode, BufReader& br) {
+ shared_ptr<DurOp> op;
+ switch( opcode ) {
+ case JEntry::OpCode_FileCreated:
+ op = shared_ptr<DurOp>( new FileCreatedOp(br) );
+ break;
+ case JEntry::OpCode_DropDb:
+ op = shared_ptr<DurOp>( new DropDbOp(br) );
+ break;
+ default:
+ massert(13546, (str::stream() << "journal recover: unrecognized opcode in journal " << opcode), false);
+ }
+ return op;
+ }
+
+ void DurOp::serialize(AlignedBuilder& ab) {
+ ab.appendNum(_opcode);
+ _serialize(ab);
+ }
+
+ DropDbOp::DropDbOp(BufReader& log) : DurOp(JEntry::OpCode_DropDb) {
+ unsigned long long reserved;
+ log.read(reserved);
+ log.read(reserved);
+ log.readStr(_db);
+ string reservedStr;
+ log.readStr(reservedStr);
+ }
+
+ void DropDbOp::_serialize(AlignedBuilder& ab) {
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendStr(_db);
+ ab.appendStr(""); // reserved
+ }
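+        // Illustrative note (not in the original source): so a serialized DropDbOp record,
+        // opcode included (written by DurOp::serialize above), looks like:
+        //   [u32 OpCode_DropDb] [u64 0] [u64 0] [db name string] [empty reserved string]
+        // and DropDbOp(BufReader&) reads the same fields back in order.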
+
+ /** throws */
+ void DropDbOp::replay() {
+ log() << "recover replay drop db " << _db << endl;
+ _deleteDataFiles(_db.c_str());
+ }
+
+ FileCreatedOp::FileCreatedOp(string f, unsigned long long l) :
+ DurOp(JEntry::OpCode_FileCreated) {
+ _p = RelativePath::fromFullPath(f);
+ _len = l;
+ }
+
+ FileCreatedOp::FileCreatedOp(BufReader& log) : DurOp(JEntry::OpCode_FileCreated) {
+ unsigned long long reserved;
+ log.read(reserved);
+ log.read(reserved);
+ log.read(_len); // size of file, not length of name
+ string s;
+ log.readStr(s);
+ _p._p = s;
+ }
+
+ void FileCreatedOp::_serialize(AlignedBuilder& ab) {
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum(_len);
+ ab.appendStr(_p.toString());
+ }
+
+ string FileCreatedOp::toString() {
+ return str::stream() << "FileCreatedOp " << _p.toString() << ' ' << _len/1024.0/1024.0 << "MB";
+ }
+
+ // if an operation deletes or creates a file (or moves etc.), it may need files closed.
+ bool FileCreatedOp::needFilesClosed() {
+ return exists( _p.asFullPath() );
+ }
+
+ void FileCreatedOp::replay() {
+            // i believe the code assumes new files are filled with zeros. thus we have to recreate the file,
+            // or at least rewrite it, even if it were already the right length. perhaps one day we should
+            // change that, although it is easier to avoid defects if we can assume it is zeros.
+ string full = _p.asFullPath();
+ if( exists(full) ) {
+ try {
+ remove(full);
+ }
+ catch(std::exception& e) {
+ log(1) << "recover info FileCreateOp::replay unlink " << e.what() << endl;
+ }
+ }
+
+ log() << "recover create file " << full << ' ' << _len/1024.0/1024.0 << "MB" << endl;
+ if( MemoryMappedFile::exists(full) ) {
+ // first delete if exists.
+ try {
+ remove(full);
+ }
+ catch(...) {
+ log() << "warning could not delete file " << full << endl;
+ }
+ }
+ ensureParentDirCreated(full);
+ File f;
+ f.open(full.c_str());
+ massert(13547, str::stream() << "recover couldn't create file " << full, f.is_open());
+ unsigned long long left = _len;
+ const unsigned blksz = 64 * 1024;
+ scoped_array<char> v( new char[blksz] );
+ memset( v.get(), 0, blksz );
+ fileofs ofs = 0;
+ while( left ) {
+ unsigned long long w = left < blksz ? left : blksz;
+ f.write(ofs, v.get(), (unsigned) w);
+ left -= w;
+ ofs += w;
+ }
+ f.fsync();
+ flushMyDirectory(full);
+ massert(13628, str::stream() << "recover failure writing file " << full, !f.bad() );
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/durop.h b/src/mongo/db/durop.h
new file mode 100644
index 00000000000..9ab1bfcbede
--- /dev/null
+++ b/src/mongo/db/durop.h
@@ -0,0 +1,109 @@
+// @file durop.h class DurOp and descendants
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "dur_journalformat.h"
+#include "../util/bufreader.h"
+#include "../util/paths.h"
+
+namespace mongo {
+
+ class AlignedBuilder;
+
+ namespace dur {
+
+ /** DurOp - Operations we journal that aren't just basic writes.
+ *
+ * Basic writes are logged as JEntry's, and indicated in ram temporarily as struct dur::WriteIntent.
+ * We don't make WriteIntent inherit from DurOp to keep it as lean as possible as there will be millions of
+ * them (we don't want a vtable for example there).
+ *
+ * For each op we want to journal, we define a subclass.
+ */
+ class DurOp { /* copyable */
+ public:
+ // @param opcode a sentinel value near max unsigned which uniquely identifies the operation.
+ // @see dur::JEntry
+ DurOp(unsigned opcode) : _opcode(opcode) { }
+
+ virtual ~DurOp() { }
+
+ /** serialize the op out to a builder which will then be written (presumably) to the journal */
+ void serialize(AlignedBuilder& ab);
+
+ /** read a durop from journal file referenced by br.
+ @param opcode the opcode which has already been written from the bufreader
+ */
+ static shared_ptr<DurOp> read(unsigned opcode, BufReader& br);
+
+ /** replay the operation (during recovery)
+ throws
+
+ For now, these are not replayed during the normal WRITETODATAFILES phase, since these
+ operations are handled in other parts of the code. At some point this may change.
+ */
+ virtual void replay() = 0;
+
+ virtual string toString() = 0;
+
+            /** if the op requires all files to be closed before doing its work, returns true. */
+ virtual bool needFilesClosed() { return false; }
+
+ protected:
+ /** DurOp will have already written the opcode for you */
+ virtual void _serialize(AlignedBuilder& ab) = 0;
+
+ private:
+ const unsigned _opcode;
+ };
+
+ /** indicates creation of a new file */
+ class FileCreatedOp : public DurOp {
+ public:
+ FileCreatedOp(BufReader& log);
+            /** @param f filename (with path) to create */
+ FileCreatedOp(string f, unsigned long long l);
+ virtual void replay();
+ virtual string toString();
+ virtual bool needFilesClosed();
+ protected:
+ virtual void _serialize(AlignedBuilder& ab);
+ private:
+ RelativePath _p;
+ unsigned long long _len; // size of file, not length of name
+ };
+
+ /** record drop of a database */
+ class DropDbOp : public DurOp {
+ public:
+ DropDbOp(BufReader& log);
+ DropDbOp(string db) :
+ DurOp(JEntry::OpCode_DropDb), _db(db) { }
+ virtual void replay();
+ virtual string toString() { return string("DropDbOp ") + _db; }
+ virtual bool needFilesClosed() { return true; }
+ protected:
+ virtual void _serialize(AlignedBuilder& ab);
+ private:
+ string _db;
+ };
+
+ }
+
+}
diff --git a/src/mongo/db/extsort.cpp b/src/mongo/db/extsort.cpp
new file mode 100644
index 00000000000..06a9756cc0a
--- /dev/null
+++ b/src/mongo/db/extsort.cpp
@@ -0,0 +1,245 @@
+// extsort.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "extsort.h"
+#include "namespace-inl.h"
+#include "../util/file.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+namespace mongo {
+
+ IndexInterface *BSONObjExternalSorter::extSortIdxInterface;
+ Ordering BSONObjExternalSorter::extSortOrder( Ordering::make(BSONObj()) );
+ unsigned long long BSONObjExternalSorter::_compares = 0;
+
+ BSONObjExternalSorter::BSONObjExternalSorter( IndexInterface &i, const BSONObj & order , long maxFileSize )
+ : _idxi(i), _order( order.getOwned() ) , _maxFilesize( maxFileSize ) ,
+ _arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0) {
+
+ stringstream rootpath;
+ rootpath << dbpath;
+ if ( dbpath[dbpath.size()-1] != '/' )
+ rootpath << "/";
+ rootpath << "_tmp/esort." << time(0) << "." << rand() << "/";
+ _root = rootpath.str();
+
+ log(1) << "external sort root: " << _root.string() << endl;
+
+ create_directories( _root );
+ _compares = 0;
+ }
+
+ BSONObjExternalSorter::~BSONObjExternalSorter() {
+ if ( _cur ) {
+ delete _cur;
+ _cur = 0;
+ }
+ unsigned long removed = remove_all( _root );
+ wassert( removed == 1 + _files.size() );
+ }
+
+ void BSONObjExternalSorter::_sortInMem() {
+        // extSortComp needs to use globals
+ // qsort_r only seems available on bsd, which is what i really want to use
+ dblock l;
+ extSortIdxInterface = &_idxi;
+ extSortOrder = Ordering::make(_order);
+ _cur->sort( BSONObjExternalSorter::extSortComp );
+ }
+
+ void BSONObjExternalSorter::sort() {
+ uassert( 10048 , "already sorted" , ! _sorted );
+
+ _sorted = true;
+
+ if ( _cur && _files.size() == 0 ) {
+ _sortInMem();
+ log(1) << "\t\t not using file. size:" << _curSizeSoFar << " _compares:" << _compares << endl;
+ return;
+ }
+
+ if ( _cur ) {
+ finishMap();
+ }
+
+ if ( _cur ) {
+ delete _cur;
+ _cur = 0;
+ }
+
+ if ( _files.size() == 0 )
+ return;
+
+ }
+
+ void BSONObjExternalSorter::add( const BSONObj& o , const DiskLoc & loc ) {
+ uassert( 10049 , "sorted already" , ! _sorted );
+
+ if ( ! _cur ) {
+ _cur = new InMemory( _arraySize );
+ }
+
+ Data& d = _cur->getNext();
+ d.first = o.getOwned();
+ d.second = loc;
+
+ long size = o.objsize();
+ _curSizeSoFar += size + sizeof( DiskLoc ) + sizeof( BSONObj );
+
+ if ( _cur->hasSpace() == false || _curSizeSoFar > _maxFilesize ) {
+ finishMap();
+ log(1) << "finishing map" << endl;
+ }
+
+ }
+
+ void BSONObjExternalSorter::finishMap() {
+ uassert( 10050 , "bad" , _cur );
+
+ _curSizeSoFar = 0;
+ if ( _cur->size() == 0 )
+ return;
+
+ _sortInMem();
+
+ stringstream ss;
+ ss << _root.string() << "/file." << _files.size();
+ string file = ss.str();
+
+ // todo: it may make sense to fadvise that this not be cached so that building the index doesn't
+ // eject other things the db is using from the file system cache. while we will soon be reading
+ // this back, if it fit in ram, there wouldn't have been a need for an external sort in the first
+ // place.
+
+ ofstream out;
+ out.open( file.c_str() , ios_base::out | ios_base::binary );
+ assertStreamGood( 10051 , (string)"couldn't open file: " + file , out );
+
+ int num = 0;
+ for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); ++i ) {
+ Data p = *i;
+ out.write( p.first.objdata() , p.first.objsize() );
+ out.write( (char*)(&p.second) , sizeof( DiskLoc ) );
+ num++;
+ }
+
+ _cur->clear();
+
+ _files.push_back( file );
+ out.close();
+
+        log(2) << "Added file: " << file << " with " << num << " objects for external sort" << endl;
+ }
+
+ // ---------------------------------
+
+ BSONObjExternalSorter::Iterator::Iterator( BSONObjExternalSorter * sorter ) :
+ _cmp( sorter->_idxi, sorter->_order ) , _in( 0 ) {
+
+ for ( list<string>::iterator i=sorter->_files.begin(); i!=sorter->_files.end(); i++ ) {
+ _files.push_back( new FileIterator( *i ) );
+ _stash.push_back( pair<Data,bool>( Data( BSONObj() , DiskLoc() ) , false ) );
+ }
+
+ if ( _files.size() == 0 && sorter->_cur ) {
+ _in = sorter->_cur;
+ _it = sorter->_cur->begin();
+ }
+ }
+
+ BSONObjExternalSorter::Iterator::~Iterator() {
+ for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ )
+ delete *i;
+ _files.clear();
+ }
+
+ bool BSONObjExternalSorter::Iterator::more() {
+
+ if ( _in )
+ return _it != _in->end();
+
+ for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ )
+ if ( (*i)->more() )
+ return true;
+ for ( vector< pair<Data,bool> >::iterator i=_stash.begin(); i!=_stash.end(); i++ )
+ if ( i->second )
+ return true;
+ return false;
+ }
+
+ BSONObjExternalSorter::Data BSONObjExternalSorter::Iterator::next() {
+
+ if ( _in ) {
+ Data& d = *_it;
+ ++_it;
+ return d;
+ }
+
+ Data best;
+ int slot = -1;
+
+ for ( unsigned i=0; i<_stash.size(); i++ ) {
+
+ if ( ! _stash[i].second ) {
+ if ( _files[i]->more() )
+ _stash[i] = pair<Data,bool>( _files[i]->next() , true );
+ else
+ continue;
+ }
+
+ if ( slot == -1 || _cmp( best , _stash[i].first ) == 0 ) {
+ best = _stash[i].first;
+ slot = i;
+ }
+
+ }
+
+ assert( slot >= 0 );
+ _stash[slot].second = false;
+
+ return best;
+ }
+
+ // -----------------------------------
+
+ BSONObjExternalSorter::FileIterator::FileIterator( string file ) {
+ unsigned long long length;
+ _buf = (char*)_file.map( file.c_str() , length , MemoryMappedFile::SEQUENTIAL );
+ massert( 10308 , "mmap failed" , _buf );
+ assert( length == (unsigned long long) file_size( file ) );
+ _end = _buf + length;
+ }
+ BSONObjExternalSorter::FileIterator::~FileIterator() {}
+
+ bool BSONObjExternalSorter::FileIterator::more() {
+ return _buf < _end;
+ }
+
+ BSONObjExternalSorter::Data BSONObjExternalSorter::FileIterator::next() {
+ BSONObj o( _buf );
+ _buf += o.objsize();
+ DiskLoc * l = (DiskLoc*)_buf;
+ _buf += 8;
+ return Data( o , *l );
+ }
+
+}
diff --git a/src/mongo/db/extsort.h b/src/mongo/db/extsort.h
new file mode 100644
index 00000000000..15a6d441849
--- /dev/null
+++ b/src/mongo/db/extsort.h
@@ -0,0 +1,150 @@
+// extsort.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+#include "namespace-inl.h"
+#include "curop-inl.h"
+#include "../util/array.h"
+
+namespace mongo {
+
+ /**
+ for external (disk) sorting by BSONObj and attaching a value
+ */
+ class BSONObjExternalSorter : boost::noncopyable {
+ public:
+ BSONObjExternalSorter( IndexInterface &i, const BSONObj & order = BSONObj() , long maxFileSize = 1024 * 1024 * 100 );
+ ~BSONObjExternalSorter();
+ typedef pair<BSONObj,DiskLoc> Data;
+
+ private:
+ IndexInterface& _idxi;
+
+ static int _compare(IndexInterface& i, const Data& l, const Data& r, const Ordering& order) {
+ RARELY killCurrentOp.checkForInterrupt();
+ _compares++;
+ int x = i.keyCompare(l.first, r.first, order);
+ if ( x )
+ return x;
+ return l.second.compare( r.second );
+ }
+
+ class MyCmp {
+ public:
+ MyCmp( IndexInterface& i, BSONObj order = BSONObj() ) : _i(i), _order( Ordering::make(order) ) {}
+ bool operator()( const Data &l, const Data &r ) const {
+ return _compare(_i, l, r, _order) < 0;
+ };
+ private:
+ IndexInterface& _i;
+ const Ordering _order;
+ };
+
+ static IndexInterface *extSortIdxInterface;
+ static Ordering extSortOrder;
+ static int extSortComp( const void *lv, const void *rv ) {
+ DEV RARELY {
+ d.dbMutex.assertWriteLocked(); // must be as we use a global var
+ }
+ Data * l = (Data*)lv;
+ Data * r = (Data*)rv;
+ return _compare(*extSortIdxInterface, *l, *r, extSortOrder);
+ };
+
+ class FileIterator : boost::noncopyable {
+ public:
+ FileIterator( string file );
+ ~FileIterator();
+ bool more();
+ Data next();
+ private:
+ MemoryMappedFile _file;
+ char * _buf;
+ char * _end;
+ };
+
+ public:
+
+ typedef FastArray<Data> InMemory;
+
+ class Iterator : boost::noncopyable {
+ public:
+
+ Iterator( BSONObjExternalSorter * sorter );
+ ~Iterator();
+ bool more();
+ Data next();
+
+ private:
+ MyCmp _cmp;
+ vector<FileIterator*> _files;
+ vector< pair<Data,bool> > _stash;
+
+ InMemory * _in;
+ InMemory::iterator _it;
+
+ };
+
+ void add( const BSONObj& o , const DiskLoc & loc );
+ void add( const BSONObj& o , int a , int b ) {
+ add( o , DiskLoc( a , b ) );
+ }
+
+ /* call after adding values, and before fetching the iterator */
+ void sort();
+
+ auto_ptr<Iterator> iterator() {
+ uassert( 10052 , "not sorted" , _sorted );
+ return auto_ptr<Iterator>( new Iterator( this ) );
+ }
+
+ int numFiles() {
+ return _files.size();
+ }
+
+ long getCurSizeSoFar() { return _curSizeSoFar; }
+
+ void hintNumObjects( long long numObjects ) {
+ if ( numObjects < _arraySize )
+ _arraySize = (int)(numObjects + 100);
+ }
+
+ private:
+
+ void _sortInMem();
+
+ void sort( string file );
+ void finishMap();
+
+ BSONObj _order;
+ long _maxFilesize;
+ path _root;
+
+ int _arraySize;
+ InMemory * _cur;
+ long _curSizeSoFar;
+
+ list<string> _files;
+ bool _sorted;
+
+ static unsigned long long _compares;
+ };
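+    // Illustrative note (not in the original source): a minimal usage sketch of the sorter
+    // above; idx, obj and loc are hypothetical.
+    //   BSONObjExternalSorter sorter( idx, BSON( "a" << 1 ) );
+    //   sorter.add( obj, loc );       // spills a sorted run to a file past maxFileSize
+    //   sorter.sort();                // in-memory sort, or finish the last run
+    //   auto_ptr<BSONObjExternalSorter::Iterator> it = sorter.iterator();
+    //   while( it->more() ) {
+    //       BSONObjExternalSorter::Data d = it->next();   // k-way merge across runs
+    //   }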
+}
diff --git a/src/mongo/db/filever.h b/src/mongo/db/filever.h
new file mode 100644
index 00000000000..e89a8243dcf
--- /dev/null
+++ b/src/mongo/db/filever.h
@@ -0,0 +1,30 @@
+/* filever.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ inline void checkDataFileVersion(NamespaceDetails& d) {
+ }
+
+ inline void checkIndexFileVersion(NamespaceDetails& d) {
+ }
+
+}
+
diff --git a/src/mongo/db/flushtest.cpp b/src/mongo/db/flushtest.cpp
new file mode 100644
index 00000000000..2009d922950
--- /dev/null
+++ b/src/mongo/db/flushtest.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include <stdio.h>
+#include "../util/goodies.h"
+#include <fcntl.h>
+
+namespace mongo {
+
+#if defined(F_FULLFSYNC)
+ void fullsync(int f) {
+ fcntl( f, F_FULLFSYNC );
+ }
+#else
+ void fullsync(int f) {
+ fdatasync(f);
+ }
+#endif
+
+ int main(int argc, char* argv[], char *envp[] ) {
+ cout << "hello" << endl;
+
+ FILE *f = fopen("/data/db/temptest", "a");
+
+ if ( f == 0 ) {
+ cout << "can't open file\n";
+ return 1;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 50000; i++ )
+ fwrite("abc", 3, 1, f);
+ cout << "small writes: " << t.millis() << "ms" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 10000; i++ ) {
+ fwrite("abc", 3, 1, f);
+ fflush(f);
+ fsync( fileno( f ) );
+ }
+ int ms = t.millis();
+ cout << "flush: " << ms << "ms, " << ms / 10000.0 << "ms/request" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 500; i++ ) {
+ fwrite("abc", 3, 1, f);
+ fflush(f);
+ fsync( fileno( f ) );
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 500 * 2;
+ cout << "flush with sleeps: " << ms << "ms, " << ms / 500.0 << "ms/request" << endl;
+ }
+
+ char buf[8192];
+ for ( int pass = 0; pass < 2; pass++ ) {
+ cout << "pass " << pass << endl;
+ {
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ if ( pass == 0 )
+ fwrite("abc", 3, 1, f);
+ else
+ fwrite(buf, 8192, 1, f);
+ buf[0]++;
+ fflush(f);
+ fullsync(fileno(f));
+ }
+ int ms = t.millis();
+ cout << "fullsync: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 500; i++ ) {
+ if ( pass == 0 )
+ fwrite("abc", 3, 1, f);
+ else
+ fwrite(buf, 8192, 1, f);
+ buf[0]++;
+ fflush(f);
+ fullsync(fileno(f));
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 2 * 500;
+ cout << "fullsync with sleeps: " << ms << "ms, " << ms / 500.0 << "ms/request" << endl;
+ }
+ }
+
+ // without growing
+ {
+ fclose(f);
+ /* try from beginning of the file, where we aren't appending and changing the file length,
+ to see if this is faster as the directory entry then doesn't have to be flushed (if noatime in effect).
+ */
+ f = fopen("/data/db/temptest", "r+");
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ fwrite("xyz", 3, 1, f);
+ fflush(f);
+ fullsync(fileno(f));
+ }
+ int ms = t.millis();
+ cout << "fullsync without growing: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+ // without growing, with delay
+ {
+ fclose(f);
+ /* try from beginning of the file, where we aren't appending and changing the file length,
+ to see if this is faster as the directory entry then doesn't have to be flushed (if noatime in effect).
+ */
+ f = fopen("/data/db/temptest", "r+");
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ fwrite("xyz", 3, 1, f);
+ fflush(f);
+ fullsync(fileno(f));
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 2 * 500;
+ cout << "fullsync without growing with sleeps: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+        fclose(f);
+        return 0;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/geo/2d.cpp b/src/mongo/db/geo/2d.cpp
new file mode 100644
index 00000000000..f05ce4315b2
--- /dev/null
+++ b/src/mongo/db/geo/2d.cpp
@@ -0,0 +1,3289 @@
+// geo2d.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../namespace-inl.h"
+#include "../jsobj.h"
+#include "../index.h"
+#include "../../util/unittest.h"
+#include "../commands.h"
+#include "../pdfile.h"
+#include "../btree.h"
+#include "../curop-inl.h"
+#include "../matcher.h"
+#include "../queryutil.h"
+#include "core.h"
+#include "../../util/timer.h"
+
+// Note: we use IndexInterface herein to talk to the btree code. In the future it would be nice
+// to be able to use the V1 key class (see key.h) instead of toBson(), which has some cost.
+// toBson() is new with V1, so this path could be slower than it used to be; quick profiling
+// would make sense.
+
+namespace mongo {
+
+ class GeoKeyNode {
+ GeoKeyNode();
+ public:
+ GeoKeyNode( DiskLoc bucket, int keyOfs, DiskLoc r, BSONObj k) : _bucket( bucket ), _keyOfs( keyOfs ), recordLoc(r), _key(k) { }
+ const DiskLoc _bucket;
+ const int _keyOfs;
+ const DiskLoc recordLoc;
+ const BSONObj _key;
+ };
+
+ // just use old indexes for geo for now. todo.
+// typedef BtreeBucket<V0> GeoBtreeBucket;
+// typedef GeoBtreeBucket::KeyNode GeoKeyNode;
+
+//#define BTREE btree<V0>
+
+#if 0
+# define CDEBUG -1
+#else
+# define CDEBUG 10
+#endif
+
+#if 0
+# define GEODEBUGGING
+# define GEODEBUG(x) cout << x << endl;
+# define GEODEBUGPRINT(x) PRINT(x)
+ inline void PREFIXDEBUG(GeoHash prefix, const GeoConvert* g) {
+ if (!prefix.constrains()) {
+ cout << "\t empty prefix" << endl;
+ return ;
+ }
+
+ Point ll (g, prefix); // lower left
+ prefix.move(1,1);
+ Point tr (g, prefix); // top right
+
+ Point center ( (ll._x+tr._x)/2, (ll._y+tr._y)/2 );
+ double radius = fabs(ll._x - tr._x) / 2;
+
+ cout << "\t ll: " << ll.toString() << " tr: " << tr.toString()
+ << " center: " << center.toString() << " radius: " << radius << endl;
+
+ }
+#else
+# define GEODEBUG(x)
+# define GEODEBUGPRINT(x)
+# define PREFIXDEBUG(x, y)
+#endif
+
+ const double EARTH_RADIUS_KM = 6371;
+ const double EARTH_RADIUS_MILES = EARTH_RADIUS_KM * 0.621371192;
+
+ enum GeoDistType {
+ GEO_PLAIN,
+ GEO_SPHERE
+ };
+
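+    // A degree of longitude spans less distance away from the equator (it shrinks
+    // by cos(latitude)), so widen the x scan distance accordingly, clamping near
+    // the poles where the correction would blow up.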
+ inline double computeXScanDistance(double y, double maxDistDegrees) {
+        // TODO: this overestimates for large maxDistDegrees far from the equator
+ return maxDistDegrees / min(cos(deg2rad(min(+89.0, y + maxDistDegrees))),
+ cos(deg2rad(max(-89.0, y - maxDistDegrees))));
+ }
+
+ GeoBitSets geoBitSets;
+
+ const string GEO2DNAME = "2d";
+
+ class Geo2dType : public IndexType , public GeoConvert {
+ public:
+ virtual ~Geo2dType() { }
+
+ Geo2dType( const IndexPlugin * plugin , const IndexSpec* spec )
+ : IndexType( plugin , spec ) {
+
+ BSONObjBuilder orderBuilder;
+
+ BSONObjIterator i( spec->keyPattern );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == String && GEO2DNAME == e.valuestr() ) {
+                    uassert( 13022 , "can't have 2 geo fields" , _geo.size() == 0 );
+ uassert( 13023 , "2d has to be first in index" , _other.size() == 0 );
+ _geo = e.fieldName();
+ }
+ else {
+ _other.push_back( e.fieldName() );
+ }
+ orderBuilder.append( "" , 1 );
+ }
+
+ uassert( 13024 , "no geo field specified" , _geo.size() );
+
+ double bits = _configval( spec , "bits" , 26 ); // for lat/long, ~ 1ft
+
+ uassert( 13028 , "bits in geo index must be between 1 and 32" , bits > 0 && bits <= 32 );
+
+ _bits = (unsigned) bits;
+
+ _max = _configval( spec , "max" , 180.0 );
+ _min = _configval( spec , "min" , -180.0 );
+
+ double numBuckets = (1024 * 1024 * 1024 * 4.0);
+
+ _scaling = numBuckets / ( _max - _min );
+
+ _order = orderBuilder.obj();
+
+ GeoHash a(0, 0, _bits);
+ GeoHash b = a;
+ b.move(1, 1);
+
+            // Epsilon is 1/1000th of a bucket size
+ // TODO: Can we actually find error bounds for the sqrt function?
+ double epsilon = 0.001 / _scaling;
+ _error = distance(a, b) + epsilon;
+
+ // Error in radians
+ _errorSphere = deg2rad( _error );
+
+ }
+
+ double _configval( const IndexSpec* spec , const string& name , double def ) {
+ BSONElement e = spec->info[name];
+ if ( e.isNumber() ) {
+ return e.numberDouble();
+ }
+ return def;
+ }
+
+ virtual BSONObj fixKey( const BSONObj& in ) {
+ if ( in.firstElement().type() == BinData )
+ return in;
+
+ BSONObjBuilder b(in.objsize()+16);
+
+ if ( in.firstElement().isABSONObj() )
+ _hash( in.firstElement().embeddedObject() ).append( b , "" );
+ else if ( in.firstElement().type() == String )
+ GeoHash( in.firstElement().valuestr() ).append( b , "" );
+ else if ( in.firstElement().type() == RegEx )
+ GeoHash( in.firstElement().regex() ).append( b , "" );
+ else
+ return in;
+
+ BSONObjIterator i(in);
+ i.next();
+ while ( i.more() )
+ b.append( i.next() );
+ return b.obj();
+ }
+
+ /** Finds the key objects to put in an index */
+ virtual void getKeys( const BSONObj& obj, BSONObjSet& keys ) const {
+ getKeys( obj, &keys, NULL );
+ }
+
+ /** Finds all locations in a geo-indexed object */
+ // TODO: Can we just return references to the locs, if they won't change?
+ void getKeys( const BSONObj& obj, vector< BSONObj >& locs ) const {
+ getKeys( obj, NULL, &locs );
+ }
+
+ /** Finds the key objects and/or locations for a geo-indexed object */
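+        // For example, with an index { loc : "2d", cat : 1 }, a document like
+        // { loc : [ [ 1, 2 ], [ 3, 4 ] ], cat : "a" } yields one key per location:
+        // { "" : hash(1,2), "" : "a" } and { "" : hash(3,4), "" : "a" }.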
+ void getKeys( const BSONObj &obj, BSONObjSet* keys, vector< BSONObj >* locs ) const {
+
+ BSONElementMSet bSet;
+
+ // Get all the nested location fields, but don't return individual elements from
+ // the last array, if it exists.
+ obj.getFieldsDotted(_geo.c_str(), bSet, false);
+
+ if( bSet.empty() )
+ return;
+
+ for( BSONElementMSet::iterator setI = bSet.begin(); setI != bSet.end(); ++setI ) {
+
+ BSONElement geo = *setI;
+
+ GEODEBUG( "Element " << geo << " found for query " << _geo.c_str() );
+
+ if ( geo.eoo() || ! geo.isABSONObj() )
+ continue;
+
+ //
+ // Grammar for location lookup:
+ // locs ::= [loc,loc,...,loc]|{<k>:loc,<k>:loc,...,<k>:loc}|loc
+ // loc ::= { <k1> : #, <k2> : # }|[#, #]|{}
+ //
+ // Empty locations are ignored, preserving single-location semantics
+ //
+
+ BSONObj embed = geo.embeddedObject();
+ if ( embed.isEmpty() )
+ continue;
+
+ // Differentiate between location arrays and locations
+ // by seeing if the first element value is a number
+ bool singleElement = embed.firstElement().isNumber();
+
+ BSONObjIterator oi(embed);
+
+ while( oi.more() ) {
+
+ BSONObj locObj;
+
+ if( singleElement ) locObj = embed;
+ else {
+ BSONElement locElement = oi.next();
+
+ uassert( 13654, str::stream() << "location object expected, location array not in correct format",
+ locElement.isABSONObj() );
+
+ locObj = locElement.embeddedObject();
+
+ if( locObj.isEmpty() )
+ continue;
+ }
+
+ BSONObjBuilder b(64);
+
+ // Remember the actual location object if needed
+ if( locs )
+ locs->push_back( locObj );
+
+ // Stop if we don't need to get anything but location objects
+ if( ! keys ) {
+ if( singleElement ) break;
+ else continue;
+ }
+
+ _hash( locObj ).append( b , "" );
+
+ // Go through all the other index keys
+ for ( vector<string>::const_iterator i = _other.begin(); i != _other.end(); ++i ) {
+
+ // Get *all* fields for the index key
+ BSONElementSet eSet;
+ obj.getFieldsDotted( *i, eSet );
+
+
+ if ( eSet.size() == 0 )
+ b.appendAs( _spec->missingField(), "" );
+ else if ( eSet.size() == 1 )
+ b.appendAs( *(eSet.begin()), "" );
+ else {
+
+ // If we have more than one key, store as an array of the objects
+
+ BSONArrayBuilder aBuilder;
+
+ for( BSONElementSet::iterator ei = eSet.begin(); ei != eSet.end(); ++ei ) {
+ aBuilder.append( *ei );
+ }
+
+ BSONArray arr = aBuilder.arr();
+
+ b.append( "", arr );
+
+ }
+
+ }
+
+ keys->insert( b.obj() );
+
+ if( singleElement ) break;
+
+ }
+ }
+
+ }
+
+ BSONObj _fromBSONHash( const BSONElement& e ) const {
+ return _unhash( _tohash( e ) );
+ }
+
+ BSONObj _fromBSONHash( const BSONObj& o ) const {
+ return _unhash( _tohash( o.firstElement() ) );
+ }
+
+ GeoHash _tohash( const BSONElement& e ) const {
+ if ( e.isABSONObj() )
+ return _hash( e.embeddedObject() );
+
+ return GeoHash( e , _bits );
+ }
+
+ GeoHash _hash( const BSONObj& o ) const {
+ BSONObjIterator i(o);
+ uassert( 13067 , "geo field is empty" , i.more() );
+ BSONElement x = i.next();
+ uassert( 13068 , "geo field only has 1 element" , i.more() );
+ BSONElement y = i.next();
+
+ uassert( 13026 , "geo values have to be numbers: " + o.toString() , x.isNumber() && y.isNumber() );
+
+ return hash( x.number() , y.number() );
+ }
+
+ GeoHash hash( const Point& p ) const {
+ return hash( p._x, p._y );
+ }
+
+ GeoHash hash( double x , double y ) const {
+ return GeoHash( _convert(x), _convert(y) , _bits );
+ }
+
+ BSONObj _unhash( const GeoHash& h ) const {
+ unsigned x , y;
+ h.unhash( x , y );
+ BSONObjBuilder b;
+ b.append( "x" , _unconvert( x ) );
+ b.append( "y" , _unconvert( y ) );
+ return b.obj();
+ }
+
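+        // Maps a coordinate from [ _min, _max ) onto the unsigned 32-bit bucket
+        // space. With the defaults ( -180, 180 ), _scaling is 2^32 / 360, so e.g.
+        // _convert( 0 ) == 2^31 and _convert( -180 ) == 0.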
+ unsigned _convert( double in ) const {
+ uassert( 13027 , str::stream() << "point not in interval of [ " << _min << ", " << _max << " )", in < _max && in >= _min );
+ in -= _min;
+ assert( in >= 0 );
+ return (unsigned)(in * _scaling);
+ }
+
+ double _unconvert( unsigned in ) const {
+ double x = in;
+ x /= _scaling;
+ x += _min;
+ return x;
+ }
+
+ void unhash( const GeoHash& h , double& x , double& y ) const {
+ unsigned a,b;
+ h.unhash(a,b);
+ x = _unconvert( a );
+ y = _unconvert( b );
+ }
+
+ double distance( const GeoHash& a , const GeoHash& b ) const {
+ double ax,ay,bx,by;
+ unhash( a , ax , ay );
+ unhash( b , bx , by );
+
+ double dx = bx - ax;
+ double dy = by - ay;
+
+ return sqrt( ( dx * dx ) + ( dy * dy ) );
+ }
+
+ double sizeDiag( const GeoHash& a ) const {
+ GeoHash b = a;
+ b.move( 1 , 1 );
+ return distance( a , b );
+ }
+
+ double sizeEdge( const GeoHash& a ) const {
+
+ if( ! a.constrains() )
+ return _max - _min;
+
+ double ax,ay,bx,by;
+ GeoHash b = a;
+ b.move( 1 , 1 );
+ unhash( a, ax, ay );
+ unhash( b, bx, by );
+
+ // _min and _max are a singularity
+ if (bx == _min)
+ bx = _max;
+
+ return (fabs(ax-bx));
+ }
+
+ const IndexDetails* getDetails() const {
+ return _spec->getDetails();
+ }
+
+ virtual shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const;
+
+ virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const {
+ BSONElement e = query.getFieldDotted(_geo.c_str());
+ switch ( e.type() ) {
+ case Object: {
+ BSONObj sub = e.embeddedObject();
+ switch ( sub.firstElement().getGtLtOp() ) {
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ return OPTIMAL;
+ default:
+                    // We can try to match if there's no other indexing defined;
+                    // the value is assumed to be a point
+ return HELPFUL;
+ }
+ }
+ case Array:
+                // We can try to match if there's no other indexing defined;
+                // the value is assumed to be a point
+ return HELPFUL;
+ default:
+ return USELESS;
+ }
+ }
+
+ string _geo;
+ vector<string> _other;
+
+ unsigned _bits;
+ double _max;
+ double _min;
+ double _scaling;
+
+ BSONObj _order;
+ double _error;
+ double _errorSphere;
+ };
+
+ class Box {
+ public:
+
+ Box( const Geo2dType * g , const GeoHash& hash )
+ : _min( g , hash ) ,
+ _max( _min._x + g->sizeEdge( hash ) , _min._y + g->sizeEdge( hash ) ) {
+ }
+
+ Box( double x , double y , double size )
+ : _min( x , y ) ,
+ _max( x + size , y + size ) {
+ }
+
+ Box( Point min , Point max )
+ : _min( min ) , _max( max ) {
+ }
+
+ Box() {}
+
+ BSONArray toBSON() const {
+ return BSON_ARRAY( BSON_ARRAY( _min._x << _min._y ) << BSON_ARRAY( _max._x << _max._y ) );
+ }
+
+ string toString() const {
+ StringBuilder buf(64);
+ buf << _min.toString() << " -->> " << _max.toString();
+ return buf.str();
+ }
+
+ bool between( double min , double max , double val , double fudge=0) const {
+ return val + fudge >= min && val <= max + fudge;
+ }
+
+ bool onBoundary( double bound, double val, double fudge = 0 ) const {
+ return ( val >= bound - fudge && val <= bound + fudge );
+ }
+
+ bool mid( double amin , double amax , double bmin , double bmax , bool min , double& res ) const {
+ assert( amin <= amax );
+ assert( bmin <= bmax );
+
+ if ( amin < bmin ) {
+ if ( amax < bmin )
+ return false;
+ res = min ? bmin : amax;
+ return true;
+ }
+ if ( amin > bmax )
+ return false;
+ res = min ? amin : bmax;
+ return true;
+ }
+
+ double intersects( const Box& other ) const {
+
+ Point boundMin(0,0);
+ Point boundMax(0,0);
+
+ if ( mid( _min._x , _max._x , other._min._x , other._max._x , true , boundMin._x ) == false ||
+ mid( _min._x , _max._x , other._min._x , other._max._x , false , boundMax._x ) == false ||
+ mid( _min._y , _max._y , other._min._y , other._max._y , true , boundMin._y ) == false ||
+ mid( _min._y , _max._y , other._min._y , other._max._y , false , boundMax._y ) == false )
+ return 0;
+
+ Box intersection( boundMin , boundMax );
+
+ return intersection.area() / area();
+ }
+
+ double area() const {
+ return ( _max._x - _min._x ) * ( _max._y - _min._y );
+ }
+
+ double maxDim() const {
+ return max( _max._x - _min._x, _max._y - _min._y );
+ }
+
+ Point center() const {
+ return Point( ( _min._x + _max._x ) / 2 ,
+ ( _min._y + _max._y ) / 2 );
+ }
+
+ void truncate( const Geo2dType* g ) {
+ if( _min._x < g->_min ) _min._x = g->_min;
+ if( _min._y < g->_min ) _min._y = g->_min;
+ if( _max._x > g->_max ) _max._x = g->_max;
+ if( _max._y > g->_max ) _max._y = g->_max;
+ }
+
+ void fudge( const Geo2dType* g ) {
+ _min._x -= g->_error;
+ _min._y -= g->_error;
+ _max._x += g->_error;
+ _max._y += g->_error;
+ }
+
+ bool onBoundary( Point p, double fudge = 0 ) {
+ return onBoundary( _min._x, p._x, fudge ) ||
+ onBoundary( _max._x, p._x, fudge ) ||
+ onBoundary( _min._y, p._y, fudge ) ||
+ onBoundary( _max._y, p._y, fudge );
+ }
+
+ bool inside( Point p , double fudge = 0 ) {
+ bool res = inside( p._x , p._y , fudge );
+ //cout << "is : " << p.toString() << " in " << toString() << " = " << res << endl;
+ return res;
+ }
+
+ bool inside( double x , double y , double fudge = 0 ) {
+ return
+ between( _min._x , _max._x , x , fudge ) &&
+ between( _min._y , _max._y , y , fudge );
+ }
+
+ bool contains(const Box& other, double fudge=0) {
+ return inside(other._min, fudge) && inside(other._max, fudge);
+ }
+
+ Point _min;
+ Point _max;
+ };
+
+
+ class Polygon {
+ public:
+
+ Polygon( void ) : _centroidCalculated( false ) {}
+
+ Polygon( vector<Point> points ) : _centroidCalculated( false ),
+ _points( points ) { }
+
+ void add( Point p ) {
+ _centroidCalculated = false;
+ _points.push_back( p );
+ }
+
+ int size( void ) const {
+ return _points.size();
+ }
+
+ /**
+ * Determine if the point supplied is contained by the current polygon.
+ *
+ * The algorithm uses a ray casting method.
+ */
+ bool contains( const Point& p ) const {
+ return contains( p, 0 ) > 0;
+ }
+
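+        /**
+         * Tri-state containment check: returns 1 if the point is inside the polygon
+         * (or exactly on a vertex/edge when fudge == 0), -1 if it is outside, and 0
+         * if the fudge box around the point intersects a polygon edge, i.e. the
+         * answer is only known to within error bounds.
+         */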
+ int contains( const Point &p, double fudge ) const {
+
+ Box fudgeBox( Point( p._x - fudge, p._y - fudge ), Point( p._x + fudge, p._y + fudge ) );
+
+ int counter = 0;
+ Point p1 = _points[0];
+ for ( int i = 1; i <= size(); i++ ) {
+ Point p2 = _points[i % size()];
+
+ GEODEBUG( "Doing intersection check of " << fudgeBox.toString() << " with seg " << p1.toString() << " to " << p2.toString() );
+
+ // We need to check whether or not this segment intersects our error box
+ if( fudge > 0 &&
+ // Points not too far below box
+ fudgeBox._min._y <= std::max( p1._y, p2._y ) &&
+ // Points not too far above box
+ fudgeBox._max._y >= std::min( p1._y, p2._y ) &&
+ // Points not too far to left of box
+ fudgeBox._min._x <= std::max( p1._x, p2._x ) &&
+ // Points not too far to right of box
+ fudgeBox._max._x >= std::min( p1._x, p2._x ) ) {
+
+ GEODEBUG( "Doing detailed check" );
+
+ // If our box contains one or more of these points, we need to do an exact check.
+ if( fudgeBox.inside(p1) ) {
+ GEODEBUG( "Point 1 inside" );
+ return 0;
+ }
+ if( fudgeBox.inside(p2) ) {
+ GEODEBUG( "Point 2 inside" );
+ return 0;
+ }
+
+ // Do intersection check for vertical sides
+ if ( p1._y != p2._y ) {
+
+ double invSlope = ( p2._x - p1._x ) / ( p2._y - p1._y );
+
+ double xintersT = ( fudgeBox._max._y - p1._y ) * invSlope + p1._x;
+ if( fudgeBox._min._x <= xintersT && fudgeBox._max._x >= xintersT ) {
+ GEODEBUG( "Top intersection @ " << xintersT );
+ return 0;
+ }
+
+ double xintersB = ( fudgeBox._min._y - p1._y ) * invSlope + p1._x;
+ if( fudgeBox._min._x <= xintersB && fudgeBox._max._x >= xintersB ) {
+ GEODEBUG( "Bottom intersection @ " << xintersB );
+ return 0;
+ }
+
+ }
+
+ // Do intersection check for horizontal sides
+ if( p1._x != p2._x ) {
+
+ double slope = ( p2._y - p1._y ) / ( p2._x - p1._x );
+
+ double yintersR = ( p1._x - fudgeBox._max._x ) * slope + p1._y;
+ if( fudgeBox._min._y <= yintersR && fudgeBox._max._y >= yintersR ) {
+ GEODEBUG( "Right intersection @ " << yintersR );
+ return 0;
+ }
+
+ double yintersL = ( p1._x - fudgeBox._min._x ) * slope + p1._y;
+ if( fudgeBox._min._y <= yintersL && fudgeBox._max._y >= yintersL ) {
+ GEODEBUG( "Left intersection @ " << yintersL );
+ return 0;
+ }
+
+ }
+
+ }
+ else if( fudge == 0 ){
+
+ // If this is an exact vertex, we won't intersect, so check this
+ if( p._y == p1._y && p._x == p1._x ) return 1;
+ else if( p._y == p2._y && p._x == p2._x ) return 1;
+
+ // If this is a horizontal line we won't intersect, so check this
+ if( p1._y == p2._y && p._y == p1._y ){
+ // Check that the x-coord lies in the line
+ if( p._x >= std::min( p1._x, p2._x ) && p._x <= std::max( p1._x, p2._x ) ) return 1;
+ }
+
+ }
+
+ // Normal intersection test.
+ // TODO: Invert these for clearer logic?
+ if ( p._y > std::min( p1._y, p2._y ) ) {
+ if ( p._y <= std::max( p1._y, p2._y ) ) {
+ if ( p._x <= std::max( p1._x, p2._x ) ) {
+ if ( p1._y != p2._y ) {
+ double xinters = (p._y-p1._y)*(p2._x-p1._x)/(p2._y-p1._y)+p1._x;
+ // Special case of point on vertical line
+ if ( p1._x == p2._x && p._x == p1._x ){
+
+ // Need special case for the vertical edges, for example:
+ // 1) \e pe/----->
+ // vs.
+ // 2) \ep---e/----->
+ //
+                                    // if we count an exact hit as an intersection, then 1 is in but 2 is out;
+                                    // if we count it as no intersection, then 1 is out but 2 is in.
+
+ return 1;
+ }
+ else if( p1._x == p2._x || p._x <= xinters ) {
+ counter++;
+ }
+ }
+ }
+ }
+ }
+
+ p1 = p2;
+ }
+
+ if ( counter % 2 == 0 ) {
+ return -1;
+ }
+ else {
+ return 1;
+ }
+ }
+
+ /**
+ * Calculate the centroid, or center of mass of the polygon object.
+ */
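+        // Uses the standard shoelace-based centroid formula:
+        //   Cx = 1/(6A) * sum( (x_i + x_{i+1}) * (x_i * y_{i+1} - x_{i+1} * y_i) )
+        //   Cy = 1/(6A) * sum( (y_i + y_{i+1}) * (x_i * y_{i+1} - x_{i+1} * y_i) )
+        // where A is the signed area of the polygon.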
+ Point centroid( void ) {
+
+            /* The centroid is cached; it won't change unless points are added */
+ if ( _centroidCalculated ) {
+ return _centroid;
+ }
+
+ Point cent;
+ double signedArea = 0.0;
+ double area = 0.0; // Partial signed area
+
+            // For all vertices except the last
+ int i = 0;
+ for ( i = 0; i < size() - 1; ++i ) {
+ area = _points[i]._x * _points[i+1]._y - _points[i+1]._x * _points[i]._y ;
+ signedArea += area;
+ cent._x += ( _points[i]._x + _points[i+1]._x ) * area;
+ cent._y += ( _points[i]._y + _points[i+1]._y ) * area;
+ }
+
+ // Do last vertex
+ area = _points[i]._x * _points[0]._y - _points[0]._x * _points[i]._y;
+ cent._x += ( _points[i]._x + _points[0]._x ) * area;
+ cent._y += ( _points[i]._y + _points[0]._y ) * area;
+ signedArea += area;
+ signedArea *= 0.5;
+ cent._x /= ( 6 * signedArea );
+ cent._y /= ( 6 * signedArea );
+
+ _centroidCalculated = true;
+ _centroid = cent;
+
+ return cent;
+ }
+
+ Box bounds( void ) {
+
+ // TODO: Cache this
+
+ _bounds._max = _points[0];
+ _bounds._min = _points[0];
+
+ for ( int i = 1; i < size(); i++ ) {
+
+ _bounds._max._x = max( _bounds._max._x, _points[i]._x );
+ _bounds._max._y = max( _bounds._max._y, _points[i]._y );
+ _bounds._min._x = min( _bounds._min._x, _points[i]._x );
+ _bounds._min._y = min( _bounds._min._y, _points[i]._y );
+
+ }
+
+ return _bounds;
+
+ }
+
+ private:
+
+ bool _centroidCalculated;
+ Point _centroid;
+
+ Box _bounds;
+
+ vector<Point> _points;
+ };
+
+ class Geo2dPlugin : public IndexPlugin {
+ public:
+ Geo2dPlugin() : IndexPlugin( GEO2DNAME ) {
+ }
+
+ virtual IndexType* generate( const IndexSpec* spec ) const {
+ return new Geo2dType( this , spec );
+ }
+ } geo2dplugin;
+
+ void __forceLinkGeoPlugin() {
+ geo2dplugin.getName();
+ }
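+    // The plugin registers itself under the name "2d", so an index created as, e.g.,
+    //   db.places.ensureIndex( { loc : "2d" }, { bits : 26, min : -180, max : 180 } )
+    // is generated by Geo2dPlugin above, and a query such as
+    //   db.places.find( { loc : { $near : [ 0, 0 ] } } )
+    // rates the index OPTIMAL via Geo2dType::suitability().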
+
+
+
+ class GeoHopper;
+
+ class GeoPoint {
+ public:
+
+        GeoPoint() : _distance( -1 ), _exact( false ), _dirty( false ), _pos( -1 )
+ {}
+
+ //// Distance not used ////
+
+ GeoPoint( const GeoKeyNode& node )
+ : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( -1 ) , _exact( false ), _dirty( false ), _bucket( node._bucket ), _pos( node._keyOfs ) {
+ }
+
+ //// Immediate initialization of distance ////
+
+ GeoPoint( const GeoKeyNode& node, double distance, bool exact )
+ : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( distance ), _exact( exact ), _dirty( false ) {
+ }
+
+ GeoPoint( const GeoPoint& pt, double distance, bool exact )
+ : _key( pt.key() ) , _loc( pt.loc() ) , _o( pt.obj() ), _distance( distance ), _exact( exact ), _dirty( false ) {
+ }
+
+ bool operator<( const GeoPoint& other ) const {
+ if( _distance != other._distance ) return _distance < other._distance;
+ if( _exact != other._exact ) return _exact < other._exact;
+ return _loc < other._loc;
+ }
+
+ double distance() const {
+ return _distance;
+ }
+
+ bool isExact() const {
+ return _exact;
+ }
+
+ BSONObj key() const {
+ return _key;
+ }
+
+ bool hasLoc() const {
+            return ! _loc.isNull();
+ }
+
+ DiskLoc loc() const {
+ assert( ! _dirty );
+ return _loc;
+ }
+
+ BSONObj obj() const {
+ return _o;
+ }
+
+ BSONObj pt() const {
+ return _pt;
+ }
+
+ bool isEmpty() {
+ return _o.isEmpty();
+ }
+
+ bool isCleanAndEmpty() {
+ return isEmpty() && ! isDirty();
+ }
+
+ string toString() const {
+ return str::stream() << "Point from " << _key << " - " << _o << " dist : " << _distance << ( _exact ? " (ex)" : " (app)" );
+ }
+
+
+ // TODO: Recover from yield by finding all the changed disk locs here, modifying the _seenPts array.
+ // Not sure yet the correct thing to do about _seen.
+ // Definitely need to re-find our current max/min locations too
+ bool unDirty( const Geo2dType* g, DiskLoc& oldLoc ){
+
+ assert( _dirty );
+ assert( ! _id.isEmpty() );
+
+ oldLoc = _loc;
+ _loc = DiskLoc();
+
+ // Fast undirty
+ IndexInterface& ii = g->getDetails()->idxInterface();
+ // Check this position and the one immediately preceding
+ for( int i = 0; i < 2; i++ ){
+ if( _pos - i < 0 ) continue;
+
+ // log() << "bucket : " << _bucket << " pos " << _pos << endl;
+
+ BSONObj key;
+ DiskLoc loc;
+ ii.keyAt( _bucket, _pos - i, key, loc );
+
+ // log() << "Loc: " << loc << " Key : " << key << endl;
+
+ if( loc.isNull() ) continue;
+
+ if( key.binaryEqual( _key ) && loc.obj()["_id"].wrap( "" ).binaryEqual( _id ) ){
+ _pos = _pos - i;
+ _loc = loc;
+ _dirty = false;
+ _o = loc.obj();
+ return true;
+ }
+ }
+
+ // Slow undirty
+ scoped_ptr<BtreeCursor> cursor( BtreeCursor::make( nsdetails( g->getDetails()->parentNS().c_str() ),
+ *( g->getDetails() ), _key, _key, true, 1 ) );
+
+ int count = 0;
+ while( cursor->ok() ){
+ count++;
+ if( cursor->current()["_id"].wrap( "" ).binaryEqual( _id ) ){
+ _bucket = cursor->getBucket();
+ _pos = cursor->getKeyOfs();
+ _loc = cursor->currLoc();
+ _o = _loc.obj();
+ break;
+ }
+ else{
+ LOG( CDEBUG + 1 ) << "Key doesn't match : " << cursor->current()["_id"] << " saved : " << _id << endl;
+ }
+ cursor->advance();
+ }
+
+ if( ! count ) { LOG( CDEBUG ) << "No key found for " << _key << endl; }
+
+ _dirty = false;
+
+ return _loc == oldLoc;
+ }
+
+ bool isDirty(){
+ return _dirty;
+ }
+
+ bool makeDirty(){
+ if( ! _dirty ){
+ assert( ! obj()["_id"].eoo() );
+ assert( ! _bucket.isNull() );
+ assert( _pos >= 0 );
+
+ if( _id.isEmpty() ){
+ _id = obj()["_id"].wrap( "" ).getOwned();
+ }
+ _o = BSONObj();
+ _key = _key.getOwned();
+ _pt = _pt.getOwned();
+ _dirty = true;
+
+ return true;
+ }
+
+ return false;
+ }
+
+ BSONObj _key;
+ DiskLoc _loc;
+ BSONObj _o;
+ BSONObj _pt;
+
+ double _distance;
+ bool _exact;
+
+ BSONObj _id;
+ bool _dirty;
+ DiskLoc _bucket;
+ int _pos;
+ };
+
+ // GeoBrowse subclasses this
+ class GeoAccumulator {
+ public:
+ GeoAccumulator( const Geo2dType * g , const BSONObj& filter, bool uniqueDocs, bool needDistance )
+ : _g(g) ,
+ _lookedAt(0) ,
+ _matchesPerfd(0) ,
+ _objectsLoaded(0) ,
+ _pointsLoaded(0) ,
+ _found(0) ,
+ _uniqueDocs( uniqueDocs ) ,
+ _needDistance( needDistance )
+ {
+ if ( ! filter.isEmpty() ) {
+ _matcher.reset( new CoveredIndexMatcher( filter , g->keyPattern() ) );
+ GEODEBUG( "Matcher is now " << _matcher->docMatcher().toString() );
+ }
+ }
+
+ virtual ~GeoAccumulator() { }
+
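+        // BAD means the key is definitely outside the search region; GOOD means it
+        // is definitely inside; BORDER means it is within error bounds of the
+        // boundary, so an exact document check is needed before accepting it.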
+ enum KeyResult { BAD, BORDER, GOOD };
+
+ virtual void add( const GeoKeyNode& node ) {
+
+ GEODEBUG( "\t\t\t\t checking key " << node._key.toString() )
+
+ _lookedAt++;
+
+ ////
+ // Approximate distance check using key data
+ ////
+ double keyD = 0;
+ Point keyP( _g, GeoHash( node._key.firstElement(), _g->_bits ) );
+ KeyResult keyOk = approxKeyCheck( keyP, keyD );
+ if ( keyOk == BAD ) {
+ GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << keyD );
+ return;
+ }
+ GEODEBUG( "\t\t\t\t good distance : " << node.recordLoc.obj() << "\t" << keyD );
+
+ ////
+ // Check for match using other key (and potentially doc) criteria
+ ////
+ // Remember match results for each object
+ map<DiskLoc, bool>::iterator match = _matched.find( node.recordLoc );
+ bool newDoc = match == _matched.end();
+ if( newDoc ) {
+
+ GEODEBUG( "\t\t\t\t matching new doc with " << (_matcher ? _matcher->docMatcher().toString() : "(empty)" ) );
+
+ // matcher
+ MatchDetails details;
+ if ( _matcher.get() ) {
+ bool good = _matcher->matchesWithSingleKeyIndex( node._key , node.recordLoc , &details );
+
+ _matchesPerfd++;
+
+ if ( details._loadedObject )
+ _objectsLoaded++;
+
+ if ( ! good ) {
+ GEODEBUG( "\t\t\t\t didn't match : " << node.recordLoc.obj()["_id"] );
+ _matched[ node.recordLoc ] = false;
+ return;
+ }
+ }
+
+ _matched[ node.recordLoc ] = true;
+
+ if ( ! details._loadedObject ) // don't double count
+ _objectsLoaded++;
+
+ }
+ else if( !((*match).second) ) {
+ GEODEBUG( "\t\t\t\t previously didn't match : " << node.recordLoc.obj()["_id"] );
+ return;
+ }
+
+ ////
+ // Exact check with particular data fields
+ ////
+ // Can add multiple points
+ int diff = addSpecific( node , keyP, keyOk == BORDER, keyD, newDoc );
+ if( diff > 0 ) _found += diff;
+ else _found -= -diff;
+
+ }
+
+ virtual void getPointsFor( const BSONObj& key, const BSONObj& obj, vector< BSONObj >& locsForNode, bool allPoints = false ){
+
+ // Find all the location objects from the keys
+ vector< BSONObj > locs;
+ _g->getKeys( obj, allPoints ? locsForNode : locs );
+ _pointsLoaded++;
+
+ if( allPoints ) return;
+ if( locs.size() == 1 ){
+ locsForNode.push_back( locs[0] );
+ return;
+ }
+
+ // Find the particular location we want
+ GeoHash keyHash( key.firstElement(), _g->_bits );
+
+ // log() << "Hash: " << node.key << " and " << keyHash.getHash() << " unique " << _uniqueDocs << endl;
+ for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ) {
+
+ // Ignore all locations not hashed to the key's hash, since we may see
+ // those later
+ if( _g->_hash( *i ) != keyHash ) continue;
+
+ locsForNode.push_back( *i );
+
+ }
+
+ }
+
+ virtual int addSpecific( const GeoKeyNode& node, const Point& p , bool inBounds, double d, bool newDoc ) = 0;
+ virtual KeyResult approxKeyCheck( const Point& p , double& keyD ) = 0;
+ virtual bool exactDocCheck( const Point& p , double& d ) = 0;
+ virtual bool expensiveExactCheck(){ return false; }
+
+
+ long long found() const {
+ return _found;
+ }
+
+ const Geo2dType * _g;
+ map<DiskLoc, bool> _matched;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+
+ long long _lookedAt;
+ long long _matchesPerfd;
+ long long _objectsLoaded;
+ long long _pointsLoaded;
+ long long _found;
+
+ bool _uniqueDocs;
+ bool _needDistance;
+
+ };
+
+
+ struct BtreeLocation {
+ BtreeLocation() { }
+
+ scoped_ptr<BtreeCursor> _cursor;
+ scoped_ptr<FieldRangeSet> _frs;
+ scoped_ptr<IndexSpec> _spec;
+
+ BSONObj key() {
+ return _cursor->currKey();
+ }
+
+ bool hasPrefix( const GeoHash& hash ) {
+ BSONObj k = key();
+ BSONElement e = k.firstElement();
+ if ( e.eoo() )
+ return false;
+ return GeoHash( e ).hasPrefix( hash );
+ }
+
+ bool checkAndAdvance( const GeoHash& hash, int& totalFound, GeoAccumulator* all ){
+ if( ! _cursor->ok() || ! hasPrefix( hash ) ) return false;
+
+ if( all ){
+ totalFound++;
+ GeoKeyNode n( _cursor->getBucket(), _cursor->getKeyOfs(), _cursor->currLoc(), _cursor->currKey() );
+ all->add( n );
+ }
+ _cursor->advance();
+
+ return true;
+ }
+
+ void save(){
+ _cursor->noteLocation();
+ }
+
+ void restore(){
+ _cursor->checkLocation();
+ }
+
+ string toString() {
+ stringstream ss;
+ ss << "bucket: " << _cursor->getBucket().toString() << " pos: " << _cursor->getKeyOfs() <<
+ ( _cursor->ok() ? ( str::stream() << " k: " << _cursor->currKey() << " o : " << _cursor->current()["_id"] ) : (string)"[none]" ) << endl;
+ return ss.str();
+ }
+
+ // Returns the min and max keys which bound a particular location.
+        // The only time these may be equal is when the start location exactly
+        // matches an indexed key; otherwise our expanding algorithm will fail.
+ static bool initial( const IndexDetails& id , const Geo2dType * spec ,
+ BtreeLocation& min , BtreeLocation& max ,
+ GeoHash start ,
+ int & found , GeoAccumulator * hopper ) {
+
+ //Ordering ordering = Ordering::make(spec->_order);
+
+            // It would be nice to build this directly, but a bug in max/min queries
+            // (SERVER-3766) and the lack of an interface make this the easiest approach for now.
+ BSONObj minQuery = BSON( spec->_geo << BSON( "$gt" << MINKEY << start.wrap( "$lte" ).firstElement() ) );
+ BSONObj maxQuery = BSON( spec->_geo << BSON( "$lt" << MAXKEY << start.wrap( "$gt" ).firstElement() ) );
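+            // The two cursors walk outward from the start hash: min scans descending
+            // through keys <= start, max scans ascending through keys > start.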
+
+ // log() << "MinQuery: " << minQuery << endl;
+ // log() << "MaxQuery: " << maxQuery << endl;
+
+ min._frs.reset( new FieldRangeSet( spec->getDetails()->parentNS().c_str(),
+ minQuery,
+ true,
+ false ) );
+
+ max._frs.reset( new FieldRangeSet( spec->getDetails()->parentNS().c_str(),
+ maxQuery,
+ true,
+ false ) );
+
+
+ BSONObjBuilder bob;
+ bob.append( spec->_geo, 1 );
+ for( vector<string>::const_iterator i = spec->_other.begin(); i != spec->_other.end(); i++ ){
+ bob.append( *i, 1 );
+ }
+ BSONObj iSpec = bob.obj();
+
+ min._spec.reset( new IndexSpec( iSpec ) );
+ max._spec.reset( new IndexSpec( iSpec ) );
+
+ shared_ptr<FieldRangeVector> frvMin( new FieldRangeVector( *(min._frs), *(min._spec), -1 ) );
+ shared_ptr<FieldRangeVector> frvMax( new FieldRangeVector( *(max._frs), *(max._spec), 1 ) );
+
+ min._cursor.reset(
+ BtreeCursor::make( nsdetails( spec->getDetails()->parentNS().c_str() ), *( spec->getDetails() ),
+ frvMin, -1 )
+ );
+
+ max._cursor.reset(
+ BtreeCursor::make( nsdetails( spec->getDetails()->parentNS().c_str() ), *( spec->getDetails() ),
+ frvMax, 1 )
+ );
+
+ // if( hopper ) min.checkCur( found, hopper );
+ // if( hopper ) max.checkCur( found, hopper );
+
+ return min._cursor->ok() || max._cursor->ok();
+ }
+ };
+
+
+ class GeoCursorBase : public Cursor {
+ public:
+
+ static const shared_ptr< CoveredIndexMatcher > emptyMatcher;
+
+ GeoCursorBase( const Geo2dType * spec )
+ : _spec( spec ), _id( _spec->getDetails() ) {
+
+ }
+
+ virtual DiskLoc refLoc() { return DiskLoc(); }
+
+ virtual BSONObj indexKeyPattern() {
+ return _spec->keyPattern();
+ }
+
+ virtual void noteLocation() {
+ // no-op since these are meant to be safe
+ }
+
+ /* called before query getmore block is iterated */
+ virtual void checkLocation() {
+ // no-op since these are meant to be safe
+ }
+
+ virtual bool supportGetMore() { return false; }
+ virtual bool supportYields() { return false; }
+
+ virtual bool getsetdup(DiskLoc loc) { return false; }
+ virtual bool modifiedKeys() const { return true; }
+ virtual bool isMultiKey() const { return false; }
+
+ virtual bool autoDedup() const { return false; }
+
+ const Geo2dType * _spec;
+ const IndexDetails * _id;
+ };
+
+ const shared_ptr< CoveredIndexMatcher > GeoCursorBase::emptyMatcher( new CoveredIndexMatcher( BSONObj(), BSONObj(), false ) );
+
+ // TODO: Pull out the cursor bit from the browse, have GeoBrowse as field of cursor to clean up
+ // this hierarchy a bit. Also probably useful to look at whether GeoAccumulator can be a member instead
+ // of a superclass.
+ class GeoBrowse : public GeoCursorBase , public GeoAccumulator {
+ public:
+
+ // The max points which should be added to an expanding box at one time
+ static const int maxPointsHeuristic = 50;
+
+ // Expand states
+ enum State {
+ START ,
+ DOING_EXPAND ,
+ DONE_NEIGHBOR ,
+ DONE
+ } _state;
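+        // START initializes the btree cursors over the current box, DOING_EXPAND
+        // scans keys within it, DONE_NEIGHBOR steps through the eight surrounding
+        // boxes (restarting at START for each), and DONE means the search space is
+        // exhausted.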
+
+ GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj(), bool uniqueDocs = true, bool needDistance = false )
+ : GeoCursorBase( g ), GeoAccumulator( g , filter, uniqueDocs, needDistance ) ,
+ _type( type ) , _filter( filter ) , _firstCall(true), _noted( false ), _nscanned(), _nDirtied(0), _nChangedOnYield(0), _nRemovedOnYield(0), _centerPrefix(0, 0, 0) {
+
+ // Set up the initial expand state
+ _state = START;
+ _neighbor = -1;
+ _foundInExp = 0;
+
+ }
+
+ virtual string toString() {
+ return (string)"GeoBrowse-" + _type;
+ }
+
+ virtual bool ok() {
+
+ bool filled = false;
+
+ LOG( CDEBUG ) << "Checking cursor, in state " << (int) _state << ", first call " << _firstCall <<
+ ", empty : " << _cur.isEmpty() << ", dirty : " << _cur.isDirty() << ", stack : " << _stack.size() << endl;
+
+ bool first = _firstCall;
+ if ( _firstCall ) {
+ fillStack( maxPointsHeuristic );
+ filled = true;
+ _firstCall = false;
+ }
+ if ( ! _cur.isCleanAndEmpty() || _stack.size() ) {
+ if ( first ) {
+ ++_nscanned;
+ }
+
+ if( _noted && filled ) noteLocation();
+ return true;
+ }
+
+ while ( moreToDo() ) {
+
+ LOG( CDEBUG ) << "Refilling stack..." << endl;
+
+ fillStack( maxPointsHeuristic );
+ filled = true;
+
+ if ( ! _cur.isCleanAndEmpty() ) {
+ if ( first ) {
+ ++_nscanned;
+ }
+
+ if( _noted && filled ) noteLocation();
+ return true;
+ }
+ }
+
+ if( _noted && filled ) noteLocation();
+ return false;
+ }
+
+ virtual bool advance() {
+ _cur._o = BSONObj();
+
+ if ( _stack.size() ) {
+ _cur = _stack.front();
+ _stack.pop_front();
+ ++_nscanned;
+ return true;
+ }
+
+ if ( ! moreToDo() )
+ return false;
+
+ bool filled = false;
+ while ( _cur.isCleanAndEmpty() && moreToDo() ){
+ fillStack( maxPointsHeuristic );
+ filled = true;
+ }
+
+ if( _noted && filled ) noteLocation();
+ return ! _cur.isCleanAndEmpty() && ++_nscanned;
+ }
+
+ virtual void noteLocation() {
+ _noted = true;
+
+ LOG( CDEBUG ) << "Noting location with " << _stack.size() << ( _cur.isEmpty() ? "" : " + 1 " ) << " points " << endl;
+
+ // Make sure we advance past the point we're at now,
+ // since the current location may move on an update/delete
+ // if( _state == DOING_EXPAND ){
+ // if( _min.hasPrefix( _prefix ) ){ _min.advance( -1, _foundInExp, this ); }
+ // if( _max.hasPrefix( _prefix ) ){ _max.advance( 1, _foundInExp, this ); }
+ // }
+
+ // Remember where our _max, _min are
+ _min.save();
+ _max.save();
+
+ LOG( CDEBUG ) << "Min " << _min.toString() << endl;
+ LOG( CDEBUG ) << "Max " << _max.toString() << endl;
+
+ // Dirty all our queued stuff
+ for( list<GeoPoint>::iterator i = _stack.begin(); i != _stack.end(); i++ ){
+
+ LOG( CDEBUG ) << "Undirtying stack point with id " << i->_id << endl;
+
+ if( i->makeDirty() ) _nDirtied++;
+ assert( i->isDirty() );
+ }
+
+ // Check current item
+ if( ! _cur.isEmpty() ){
+ if( _cur.makeDirty() ) _nDirtied++;
+ }
+
+ // Our cached matches become invalid now
+ _matched.clear();
+ }
+
+ void fixMatches( DiskLoc oldLoc, DiskLoc newLoc ){
+ map<DiskLoc, bool>::iterator match = _matched.find( oldLoc );
+ if( match != _matched.end() ){
+ bool val = match->second;
+ _matched.erase( oldLoc );
+ _matched[ newLoc ] = val;
+ }
+ }
+
+ /* called before query getmore block is iterated */
+ virtual void checkLocation() {
+
+ LOG( CDEBUG ) << "Restoring location with " << _stack.size() << ( ! _cur.isDirty() ? "" : " + 1 " ) << " points " << endl;
+
+ // We can assume an error was thrown earlier if this database somehow disappears
+
+ // Recall our _max, _min
+ _min.restore();
+ _max.restore();
+
+ LOG( CDEBUG ) << "Min " << _min.toString() << endl;
+ LOG( CDEBUG ) << "Max " << _max.toString() << endl;
+
+ // If the current key moved, we may have been advanced past the current point - need to check this
+ // if( _state == DOING_EXPAND ){
+ // if( _min.hasPrefix( _prefix ) ){ _min.advance( -1, _foundInExp, this ); }
+ // if( _max.hasPrefix( _prefix ) ){ _max.advance( 1, _foundInExp, this ); }
+ //}
+
+            // Undirty all the queued stuff
+ list<GeoPoint>::iterator i = _stack.begin();
+ while( i != _stack.end() ){
+
+ LOG( CDEBUG ) << "Undirtying stack point with id " << i->_id << endl;
+
+ DiskLoc oldLoc;
+ if( i->unDirty( _spec, oldLoc ) ){
+ // Document is in same location
+ LOG( CDEBUG ) << "Undirtied " << oldLoc << endl;
+
+ i++;
+ }
+ else if( ! i->loc().isNull() ){
+
+ // Re-found document somewhere else
+ LOG( CDEBUG ) << "Changed location of " << i->_id << " : " << i->loc() << " vs " << oldLoc << endl;
+
+ _nChangedOnYield++;
+ fixMatches( oldLoc, i->loc() );
+ i++;
+ }
+ else {
+
+ // Can't re-find document
+ LOG( CDEBUG ) << "Removing document " << i->_id << endl;
+
+ _nRemovedOnYield++;
+ _found--;
+ assert( _found >= 0 );
+
+ // Can't find our key again, remove
+ i = _stack.erase( i );
+ }
+ }
+
+ if( _cur.isDirty() ){
+ LOG( CDEBUG ) << "Undirtying cur point with id : " << _cur._id << endl;
+ }
+
+ // Check current item
+ DiskLoc oldLoc;
+ if( _cur.isDirty() && ! _cur.unDirty( _spec, oldLoc ) ){
+ if( _cur.loc().isNull() ){
+
+ // Document disappeared!
+ LOG( CDEBUG ) << "Removing cur point " << _cur._id << endl;
+
+ _nRemovedOnYield++;
+ advance();
+ }
+ else{
+
+ // Document moved
+ LOG( CDEBUG ) << "Changed location of cur point " << _cur._id << " : " << _cur.loc() << " vs " << oldLoc << endl;
+
+ _nChangedOnYield++;
+ fixMatches( oldLoc, _cur.loc() );
+ }
+ }
+
+ _noted = false;
+ }
+
+ virtual Record* _current() { assert(ok()); LOG( CDEBUG + 1 ) << "_current " << _cur._loc.obj()["_id"] << endl; return _cur._loc.rec(); }
+ virtual BSONObj current() { assert(ok()); LOG( CDEBUG + 1 ) << "current " << _cur._o << endl; return _cur._o; }
+ virtual DiskLoc currLoc() { assert(ok()); LOG( CDEBUG + 1 ) << "currLoc " << _cur._loc << endl; return _cur._loc; }
+ virtual BSONObj currKey() const { return _cur._key; }
+
+ virtual CoveredIndexMatcher* matcher() const {
+ if( _matcher.get() ) return _matcher.get();
+ else return GeoCursorBase::emptyMatcher.get();
+ }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const {
+ if( _matcher.get() ) return _matcher;
+ else return GeoCursorBase::emptyMatcher;
+ }
+
+        // Is there more searching to do for points?
+ virtual bool moreToDo() {
+ return _state != DONE;
+ }
+
+ virtual bool supportGetMore() { return true; }
+
+ // Fills the stack, but only checks a maximum number of maxToCheck points at a time.
+ // Further calls to this function will continue the expand/check neighbors algorithm.
+ virtual void fillStack( int maxToCheck, int maxToAdd = -1, bool onlyExpand = false ) {
+
+#ifdef GEODEBUGGING
+ log() << "Filling stack with maximum of " << maxToCheck << ", state : " << (int) _state << endl;
+#endif
+
+ if( maxToAdd < 0 ) maxToAdd = maxToCheck;
+ int maxFound = _foundInExp + maxToCheck;
+ assert( maxToCheck > 0 );
+ assert( maxFound > 0 );
+ assert( _found <= 0x7fffffff ); // conversion to int
+ int maxAdded = static_cast<int>(_found) + maxToAdd;
+ assert( maxAdded >= 0 ); // overflow check
+
+ bool isNeighbor = _centerPrefix.constrains();
+
+ // Starting a box expansion
+ if ( _state == START ) {
+
+ // Get the very first hash point, if required
+ if( ! isNeighbor )
+ _prefix = expandStartHash();
+
+ GEODEBUG( "initializing btree" );
+
+#ifdef GEODEBUGGING
+ log() << "Initializing from b-tree with hash of " << _prefix << " @ " << Box( _g, _prefix ) << endl;
+#endif
+
+ if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , _prefix , _foundInExp , this ) )
+ _state = isNeighbor ? DONE_NEIGHBOR : DONE;
+ else {
+ _state = DOING_EXPAND;
+ _lastPrefix.reset();
+ }
+
+            GEODEBUG( (_state == DONE_NEIGHBOR || _state == DONE ? "not initialized" : "initialized") );
+
+ }
+
+ // Doing the actual box expansion
+ if ( _state == DOING_EXPAND ) {
+
+ while ( true ) {
+
+ GEODEBUG( "box prefix [" << _prefix << "]" );
+#ifdef GEODEBUGGING
+ if( _prefix.constrains() ) {
+ log() << "current expand box : " << Box( _g, _prefix ).toString() << endl;
+ }
+ else {
+ log() << "max expand box." << endl;
+ }
+#endif
+
+ GEODEBUG( "expanding box points... ");
+
+ // Record the prefix we're actively exploring...
+ _expPrefix.reset( new GeoHash( _prefix ) );
+
+ // Find points inside this prefix
+ while ( _min.checkAndAdvance( _prefix, _foundInExp, this ) && _foundInExp < maxFound && _found < maxAdded );
+ while ( _max.checkAndAdvance( _prefix, _foundInExp, this ) && _foundInExp < maxFound && _found < maxAdded );
+
+#ifdef GEODEBUGGING
+
+ log() << "finished expand, checked : " << ( maxToCheck - ( maxFound - _foundInExp ) )
+ << " found : " << ( maxToAdd - ( maxAdded - _found ) )
+ << " max : " << maxToCheck << " / " << maxToAdd << endl;
+
+#endif
+
+ GEODEBUG( "finished expand, found : " << ( maxToAdd - ( maxAdded - _found ) ) );
+ if( _foundInExp >= maxFound || _found >= maxAdded ) return;
+
+ // We've searched this prefix fully, remember
+ _lastPrefix.reset( new GeoHash( _prefix ));
+
+ // If we've searched the entire space, we're finished.
+ if ( ! _prefix.constrains() ) {
+ GEODEBUG( "box exhausted" );
+ _state = DONE;
+ notePrefix();
+ return;
+ }
+
+ // If we won't fit in the box, and we're not doing a sub-scan, increase the size
+ if ( ! fitsInBox( _g->sizeEdge( _prefix ) ) && _fringe.size() == 0 ) {
+
+ // If we're still not expanded bigger than the box size, expand again
+ // TODO: Is there an advantage to scanning prior to expanding?
+ _prefix = _prefix.up();
+ continue;
+
+ }
+
+ // log() << "finished box prefix [" << _prefix << "]" << endl;
+
+ // We're done and our size is large enough
+ _state = DONE_NEIGHBOR;
+
+ // Go to the next sub-box, if applicable
+ if( _fringe.size() > 0 ) _fringe.pop_back();
+ // Go to the next neighbor if this was the last sub-search
+ if( _fringe.size() == 0 ) _neighbor++;
+
+ break;
+
+ }
+
+ notePrefix();
+ }
+
+            // If we were asked only to expand the current box, don't move on to the neighbors
+ if( onlyExpand ) return;
+
+ // If we're done expanding the current box...
+ if( _state == DONE_NEIGHBOR ) {
+
+ // Iterate to the next neighbor
+ // Loop is useful for cases where we want to skip over boxes entirely,
+ // otherwise recursion increments the neighbors.
+ for ( ; _neighbor < 9; _neighbor++ ) {
+
+ // If we have no fringe for the neighbor, make sure we have the default fringe
+ if( _fringe.size() == 0 ) _fringe.push_back( "" );
+
+ if( ! isNeighbor ) {
+ _centerPrefix = _prefix;
+ _centerBox = Box( _g, _centerPrefix );
+ isNeighbor = true;
+ }
+
+ int i = (_neighbor / 3) - 1;
+ int j = (_neighbor % 3) - 1;
+
+ if ( ( i == 0 && j == 0 ) ||
+ ( i < 0 && _centerPrefix.atMinX() ) ||
+ ( i > 0 && _centerPrefix.atMaxX() ) ||
+ ( j < 0 && _centerPrefix.atMinY() ) ||
+ ( j > 0 && _centerPrefix.atMaxY() ) ) {
+
+ //log() << "not moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() << " " << _centerPrefix << endl;
+ //log() << _centerPrefix.atMinX() << " "
+ // << _centerPrefix.atMinY() << " "
+ // << _centerPrefix.atMaxX() << " "
+ // << _centerPrefix.atMaxY() << " " << endl;
+
+ continue; // main box or wrapped edge
+ // TODO: We may want to enable wrapping in future, probably best as layer on top of
+ // this search.
+ }
+
+ // Make sure we've got a reasonable center
+ assert( _centerPrefix.constrains() );
+
+ GeoHash _neighborPrefix = _centerPrefix;
+ _neighborPrefix.move( i, j );
+
+ //log() << "moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() << " " << _centerPrefix << " " << _neighborPrefix << endl;
+
+ GEODEBUG( "moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() );
+ PREFIXDEBUG( _centerPrefix, _g );
+ PREFIXDEBUG( _neighborPrefix , _g );
+ while( _fringe.size() > 0 ) {
+
+ _prefix = _neighborPrefix + _fringe.back();
+ Box cur( _g , _prefix );
+
+ PREFIXDEBUG( _prefix, _g );
+
+ double intAmt = intersectsBox( cur );
+
+ // No intersection
+ if( intAmt <= 0 ) {
+ GEODEBUG( "skipping box" << cur.toString() );
+ _fringe.pop_back();
+ continue;
+ }
+ // Small intersection, refine search
+ else if( intAmt < 0.5 && _prefix.canRefine() && _fringe.back().size() < 4 /* two bits */ ) {
+
+ GEODEBUG( "Intersection small : " << intAmt << ", adding to fringe: " << _fringe.back() << " curr prefix : " << _prefix << " bits : " << _prefix.getBits() );
+
+ // log() << "Diving to level : " << ( _fringe.back().size() / 2 + 1 ) << endl;
+
+ string lastSuffix = _fringe.back();
+ _fringe.pop_back();
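+                    // Subdivide this box into its four child quadrants by
+                    // appending two more hash bits to the suffix.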
+ _fringe.push_back( lastSuffix + "00" );
+ _fringe.push_back( lastSuffix + "01" );
+ _fringe.push_back( lastSuffix + "11" );
+ _fringe.push_back( lastSuffix + "10" );
+
+ continue;
+ }
+
+ // Restart our search from a diff box.
+ _state = START;
+
+ assert( ! onlyExpand );
+
+ assert( _found <= 0x7fffffff );
+ fillStack( maxFound - _foundInExp, maxAdded - static_cast<int>(_found) );
+
+ // When we return from the recursive fillStack call, we'll either have checked enough points or
+ // be entirely done. Max recurse depth is < 8 * 16.
+
+ // If we're maxed out on points, return
+ if( _foundInExp >= maxFound || _found >= maxAdded ) {
+ // Make sure we'll come back to add more points
+ assert( _state == DOING_EXPAND );
+ return;
+ }
+
+ // Otherwise we must be finished to return
+ assert( _state == DONE );
+ return;
+
+ }
+
+ }
+
+ // Finished with neighbors
+ _state = DONE;
+ }
+
+ }
+
+ // The initial geo hash box for our first expansion
+ virtual GeoHash expandStartHash() = 0;
+
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ) = 0;
+
+ // The amount the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ) = 0;
+
+ bool remembered( BSONObj o ){
+ BSONObj seenId = o["_id"].wrap("").getOwned();
+ if( _seenIds.find( seenId ) != _seenIds.end() ){
+ LOG( CDEBUG + 1 ) << "Object " << o["_id"] << " already seen." << endl;
+ return true;
+ }
+ else{
+ _seenIds.insert( seenId );
+ LOG( CDEBUG + 1 ) << "Object " << o["_id"] << " remembered." << endl;
+ return false;
+ }
+ }
+
+ virtual int addSpecific( const GeoKeyNode& node , const Point& keyP , bool onBounds , double keyD , bool potentiallyNewDoc ) {
+
+ int found = 0;
+
+            // We need to handle every possible point in this method, even those not in the key
+            // value, to avoid having to track which hashes we've already seen.
+ if( ! potentiallyNewDoc ){
+ // log() << "Already handled doc!" << endl;
+ return 0;
+ }
+
+ // Final check for new doc
+ // OK to touch, since we're probably returning this object now
+ if( remembered( node.recordLoc.obj() ) ) return 0;
+
+ if( _uniqueDocs && ! onBounds ) {
+ //log() << "Added ind to " << _type << endl;
+ _stack.push_front( GeoPoint( node ) );
+ found++;
+ }
+ else {
+ // We now handle every possible point in the document, even those not in the key value,
+ // since we're iterating through them anyway - prevents us from having to save the hashes
+ // we've seen per-doc
+
+ // If we're filtering by hash, get the original
+ bool expensiveExact = expensiveExactCheck();
+
+ vector< BSONObj > locs;
+ getPointsFor( node._key, node.recordLoc.obj(), locs, true );
+ for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ){
+
+ double d = -1;
+ Point p( *i );
+
+ // We can avoid exact document checks by redoing approx checks,
+ // if the exact checks are more expensive.
+ bool needExact = true;
+ if( expensiveExact ){
+ assert( false );
+ KeyResult result = approxKeyCheck( p, d );
+ if( result == BAD ) continue;
+ else if( result == GOOD ) needExact = false;
+ }
+
+ if( ! needExact || exactDocCheck( p, d ) ){
+ //log() << "Added mult to " << _type << endl;
+ _stack.push_front( GeoPoint( node ) );
+ found++;
+ // If returning unique, just exit after first point is added
+ if( _uniqueDocs ) break;
+ }
+ }
+ }
+
+ while( _cur.isCleanAndEmpty() && _stack.size() > 0 ){
+ _cur = _stack.front();
+ _stack.pop_front();
+ }
+
+ return found;
+ }
+
+ virtual long long nscanned() {
+ if ( _firstCall ) {
+ ok();
+ }
+ return _nscanned;
+ }
+
+ virtual void explainDetails( BSONObjBuilder& b ){
+ b << "lookedAt" << _lookedAt;
+ b << "matchesPerfd" << _matchesPerfd;
+ b << "objectsLoaded" << _objectsLoaded;
+ b << "pointsLoaded" << _pointsLoaded;
+ b << "pointsSavedForYield" << _nDirtied;
+ b << "pointsChangedOnYield" << _nChangedOnYield;
+ b << "pointsRemovedOnYield" << _nRemovedOnYield;
+ }
+
+ virtual BSONObj prettyIndexBounds() const {
+
+ vector<GeoHash>::const_iterator i = _expPrefixes.end();
+ if( _expPrefixes.size() > 0 && *(--i) != *( _expPrefix.get() ) )
+ _expPrefixes.push_back( *( _expPrefix.get() ) );
+
+ BSONObjBuilder bob;
+ BSONArrayBuilder bab;
+ for( i = _expPrefixes.begin(); i != _expPrefixes.end(); ++i ){
+ bab << Box( _g, *i ).toBSON();
+ }
+ bob << _g->_geo << bab.arr();
+
+ return bob.obj();
+
+ }
+
+ void notePrefix() {
+ _expPrefixes.push_back( _prefix );
+ }
+
+ string _type;
+ BSONObj _filter;
+ list<GeoPoint> _stack;
+ set<BSONObj> _seenIds;
+
+ GeoPoint _cur;
+ bool _firstCall;
+ bool _noted;
+
+ long long _nscanned;
+ long long _nDirtied;
+ long long _nChangedOnYield;
+ long long _nRemovedOnYield;
+
+ // The current box we're expanding (-1 is first/center box)
+ int _neighbor;
+
+ // The points we've found so far
+ // TODO: Long long?
+ int _foundInExp;
+
+ // The current hash prefix we're expanding and the center-box hash prefix
+ GeoHash _prefix;
+ shared_ptr<GeoHash> _lastPrefix;
+ GeoHash _centerPrefix;
+ list<string> _fringe;
+ int recurseDepth;
+ Box _centerBox;
+
+ // Start and end of our search range in the current box
+ BtreeLocation _min;
+ BtreeLocation _max;
+
+ shared_ptr<GeoHash> _expPrefix;
+ mutable vector<GeoHash> _expPrefixes;
+
+ };
+
+
+ class GeoHopper : public GeoBrowse {
+ public:
+ typedef multiset<GeoPoint> Holder;
+
+ GeoHopper( const Geo2dType * g , unsigned max , const Point& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = true )
+ : GeoBrowse( g, "search", filter, uniqueDocs, needDistance ), _max( max ) , _near( n ), _maxDistance( maxDistance ), _type( type ), _distError( type == GEO_PLAIN ? g->_error : g->_errorSphere ), _farthest(0)
+ {}
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+ // Always check approximate distance, since it lets us avoid doing
+ // checks of the rest of the object if it succeeds
+
+ switch (_type) {
+ case GEO_PLAIN:
+ d = _near.distance( p );
+ break;
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ d = spheredist_deg( _near, p );
+ break;
+ default: assert( false );
+ }
+ assert( d >= 0 );
+
+ GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near.toString()
+ << "\t" << p.toString() << "\t" << d
+ << " farthest: " << farthest() );
+
+ // If we need more points
+ double borderDist = ( _points.size() < _max ? _maxDistance : farthest() );
+
+ if( d >= borderDist - 2 * _distError && d <= borderDist + 2 * _distError ) return BORDER;
+ else return d < borderDist ? GOOD : BAD;
+
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+
+ bool within = false;
+
+ // Get the appropriate distance for the type
+ switch ( _type ) {
+ case GEO_PLAIN:
+ d = _near.distance( p );
+ within = _near.distanceWithin( p, _maxDistance );
+ break;
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ d = spheredist_deg( _near, p );
+ within = ( d <= _maxDistance );
+ break;
+ default: assert( false );
+ }
+
+ return within;
+ }
+
+ // Always in distance units, whether radians or normal
+ double farthest() const {
+ return _farthest;
+ }
+
+ virtual int addSpecific( const GeoKeyNode& node, const Point& keyP, bool onBounds, double keyD, bool potentiallyNewDoc ) {
+
+ // Unique documents
+
+ GeoPoint newPoint( node, keyD, false );
+
+ int prevSize = _points.size();
+
+ // STEP 1 : Remove old duplicate points from the set if needed
+ if( _uniqueDocs ){
+
+ // Lookup old point with same doc
+ map< DiskLoc , Holder::iterator >::iterator oldPointIt = _seenPts.find( newPoint.loc() );
+
+ if( oldPointIt != _seenPts.end() ){
+ const GeoPoint& oldPoint = *(oldPointIt->second);
+ // We don't need to care if we've already seen this same approx pt or better,
+ // or we've already gone to disk once for the point
+ if( oldPoint < newPoint ){
+ GEODEBUG( "\t\tOld point closer than new point" );
+ return 0;
+ }
+ GEODEBUG( "\t\tErasing old point " << oldPointIt->first.obj() );
+ _points.erase( oldPointIt->second );
+ }
+ }
+
+ Holder::iterator newIt = _points.insert( newPoint );
+ if( _uniqueDocs ) _seenPts[ newPoint.loc() ] = newIt;
+
+ GEODEBUG( "\t\tInserted new point " << newPoint.toString() << " approx : " << keyD );
+
+ assert( _max > 0 );
+
+ Holder::iterator lastPtIt = _points.end();
+ lastPtIt--;
+ _farthest = lastPtIt->distance() + 2 * _distError;
+
+ return _points.size() - prevSize;
+
+ }
+
+ // Removes extra points from end of _points set.
+ // Check can be a bit costly if we have lots of exact points near borders,
+        // so we'll only do this every once in a while.
+ void processExtraPoints(){
+
+ if( _points.size() == 0 ) return;
+
+ int prevSize = _points.size();
+
+ // Erase all points from the set with a position >= _max *and*
+ // whose distance isn't close to the _max - 1 position distance
+
+ int numToErase = _points.size() - _max;
+ if( numToErase < 0 ) numToErase = 0;
+
+ // Get the first point definitely in the _points array
+ Holder::iterator startErase = _points.end();
+ for( int i = 0; i < numToErase + 1; i++ ) startErase--;
+ _farthest = startErase->distance() + 2 * _distError;
+
+ GEODEBUG( "\t\tPotentially erasing " << numToErase << " points, " << " size : " << _points.size() << " max : " << _max << " dist : " << startErase->distance() << " farthest dist : " << _farthest << " from error : " << _distError );
+
+ startErase++;
+ while( numToErase > 0 && startErase->distance() <= _farthest ){
+ GEODEBUG( "\t\tNot erasing point " << startErase->toString() );
+ numToErase--;
+ startErase++;
+ assert( startErase != _points.end() || numToErase == 0 );
+ }
+
+ if( _uniqueDocs ){
+ for( Holder::iterator i = startErase; i != _points.end(); ++i )
+ _seenPts.erase( i->loc() );
+ }
+
+ _points.erase( startErase, _points.end() );
+
+ int diff = _points.size() - prevSize;
+ if( diff > 0 ) _found += diff;
+ else _found -= -diff;
+
+ }
+
+ unsigned _max;
+ Point _near;
+ Holder _points;
+ double _maxDistance;
+ GeoDistType _type;
+ double _distError;
+ double _farthest;
+
+ // Safe to use currently since we don't yield in $near searches. If we do start to yield, we may need to
+ // replace dirtied disklocs in our holder / ensure our logic is correct.
+ map< DiskLoc , Holder::iterator > _seenPts;
+
+ };
+
+
+
+ class GeoSearch : public GeoHopper {
+ public:
+ GeoSearch( const Geo2dType * g , const Point& startPt , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = false )
+ : GeoHopper( g , numWanted , startPt , filter , maxDistance, type, uniqueDocs, needDistance ),
+ _start( g->hash( startPt._x, startPt._y ) ),
+ // TODO: Remove numWanted...
+ _numWanted( numWanted ),
+ _type(type)
+ {
+
+ assert( g->getDetails() );
+ _nscanned = 0;
+ _found = 0;
+
+ if( _maxDistance < 0 ){
+ _scanDistance = numeric_limits<double>::max();
+ }
+ else if (type == GEO_PLAIN) {
+ _scanDistance = maxDistance + _spec->_error;
+ }
+ else if (type == GEO_SPHERE) {
+ checkEarthBounds( startPt );
+ // TODO: consider splitting into x and y scan distances
+ _scanDistance = computeXScanDistance( startPt._y, rad2deg( _maxDistance ) + _spec->_error );
+ }
+
+ assert( _scanDistance > 0 );
+
+ }
+
+
+ /** Keys we've already looked at. Checking this before add() avoids some of the
+ work of extracting the key BSON for an already-seen point.
+ */
+ private:
+ set< pair<DiskLoc,int> > _seen;
+ public:
+
+ void exec() {
+
+ if( _numWanted == 0 ) return;
+
+ /*
+ * Search algorithm
+ * 1) use geohash prefix to find X items
+ * 2) compute the max distance from the start point to an item
+ * 3) find the optimal set of boxes that completes the circle
+ * 4) use regular btree cursors to scan those boxes
+ */
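+ // e.g. (illustrative): a query like
+ //   db.places.find( { loc : { $near : [ -73.88 , 40.78 ] , $maxDistance : 0.1 } } )
+ // reaches here via Geo2dType::newCursor() with _near = (-73.88, 40.78).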
+
+#ifdef GEODEBUGGING
+
+ log() << "start near search for " << _numWanted << " points near " << _near << " (max dist " << _maxDistance << ")" << endl;
+
+#endif
+
+ // Part 1
+ {
+ do {
+ long long f = found();
+ assert( f <= 0x7fffffff );
+ fillStack( maxPointsHeuristic, _numWanted - static_cast<int>(f) , true );
+ processExtraPoints();
+ } while( _state != DONE && _state != DONE_NEIGHBOR &&
+ found() < _numWanted &&
+ (! _prefix.constrains() || _g->sizeEdge( _prefix ) <= _scanDistance ) );
+
+ // If we couldn't scan or scanned everything, we're done
+ if( _state == DONE ){
+ expandEndPoints();
+ return;
+ }
+ }
+
+#ifdef GEODEBUGGING
+
+ log() << "part 1 of near search completed, found " << found() << " points (out of " << _foundInExp << " scanned)"
+ << " in expanded region " << _prefix << " @ " << Box( _g, _prefix )
+ << " with furthest distance " << farthest() << endl;
+
+#endif
+
+ // Part 2
+ {
+
+ // Find farthest distance for completion scan
+ double farDist = farthest();
+ if( found() < _numWanted ) {
+ // Not enough found in Part 1
+ farDist = _scanDistance;
+ }
+ else if ( _type == GEO_PLAIN ) {
+ // Enough found, but need to search neighbor boxes
+ farDist += _spec->_error;
+ }
+ else if ( _type == GEO_SPHERE ) {
+ // Enough found, but need to search neighbor boxes
+ farDist = std::min( _scanDistance, computeXScanDistance( _near._y, rad2deg( farDist ) ) + 2 * _spec->_error );
+ }
+ assert( farDist >= 0 );
+ GEODEBUGPRINT( farDist );
+
+ // Find the box that includes all the points we need to return
+ _want = Box( _near._x - farDist , _near._y - farDist , farDist * 2 );
+ GEODEBUGPRINT( _want.toString() );
+
+ // log() << "Found : " << found() << " wanted : " << _numWanted << " Far distance : " << farDist << " box : " << _want << endl;
+
+ // Remember the far distance for further scans
+ _scanDistance = farDist;
+
+ // Reset the search, our distances have probably changed
+ if( _state == DONE_NEIGHBOR ){
+ _state = DOING_EXPAND;
+ _neighbor = -1;
+ }
+
+#ifdef GEODEBUGGING
+
+ log() << "resetting search with start at " << _start << " (edge length " << _g->sizeEdge( _start ) << ")" << endl;
+
+#endif
+
+ // Do regular search in the full region
+ do {
+ fillStack( maxPointsHeuristic );
+ processExtraPoints();
+ }
+ while( _state != DONE );
+
+ }
+
+ GEODEBUG( "done near search with " << _points.size() << " points " );
+
+ expandEndPoints();
+
+ }
+
+ void addExactPoints( const GeoPoint& pt, Holder& points, bool force ){
+ int before, after;
+ addExactPoints( pt, points, before, after, force );
+ }
+
+ void addExactPoints( const GeoPoint& pt, Holder& points, int& before, int& after, bool force ){
+
+ before = 0;
+ after = 0;
+
+ GEODEBUG( "Adding exact points for " << pt.toString() );
+
+ if( pt.isExact() ){
+ if( force ) points.insert( pt );
+ return;
+ }
+
+ vector<BSONObj> locs;
+ getPointsFor( pt.key(), pt.obj(), locs, _uniqueDocs );
+
+ GeoPoint nearestPt( pt, -1, true );
+
+ for( vector<BSONObj>::iterator i = locs.begin(); i != locs.end(); i++ ){
+
+ Point loc( *i );
+
+ double d;
+ if( ! exactDocCheck( loc, d ) ) continue;
+
+ if( _uniqueDocs && ( nearestPt.distance() < 0 || d < nearestPt.distance() ) ){
+ nearestPt._distance = d;
+ nearestPt._pt = *i;
+ continue;
+ }
+ else if( ! _uniqueDocs ){
+ GeoPoint exactPt( pt, d, true );
+ exactPt._pt = *i;
+ GEODEBUG( "Inserting exact pt " << exactPt.toString() << " for " << pt.toString() << " exact : " << d << " is less? " << ( exactPt < pt ) << " bits : " << _g->_bits );
+ points.insert( exactPt );
+ exactPt < pt ? before++ : after++;
+ }
+
+ }
+
+ if( _uniqueDocs && nearestPt.distance() >= 0 ){
+ GEODEBUG( "Inserting unique exact pt " << nearestPt.toString() << " for " << pt.toString() << " exact : " << nearestPt.distance() << " is less? " << ( nearestPt < pt ) << " bits : " << _g->_bits );
+ points.insert( nearestPt );
+ if( nearestPt < pt ) before++;
+ else after++;
+ }
+
+ }
+
+ // TODO: Refactor this back into the holder class, and allow it to run periodically when we are seeing a lot of points
+ void expandEndPoints( bool finish = true ){
+
+ processExtraPoints();
+
+ // All points in the array *could* be within maxDistance
+
+ // Step 1 : Trim points to max size
+ // TODO: This check does little for now, but is a skeleton for future work on
+ // incremental $near searches
+ if( _max > 0 ){
+
+ int numToErase = _points.size() - _max;
+
+ if( numToErase > 0 ){
+
+ Holder tested;
+
+ // Work backward through all points we're not sure belong in the set
+ Holder::iterator maybePointIt = _points.end();
+ maybePointIt--;
+ double approxMin = maybePointIt->distance() - 2 * _distError;
+
+ GEODEBUG( "\t\tNeed to erase " << numToErase << " max : " << _max << " min dist " << approxMin << " error : " << _distError << " starting from : " << (*maybePointIt).toString() );
+
+ // Erase all uncertain points from _points, collecting their exact versions in 'tested'
+ int erased = 0;
+ while( _points.size() > 0 && ( maybePointIt->distance() >= approxMin || erased < numToErase ) ){
+
+ Holder::iterator current = maybePointIt--;
+
+ addExactPoints( *current, tested, true );
+ _points.erase( current );
+ erased++;
+
+ if( tested.size() )
+ approxMin = tested.begin()->distance() - 2 * _distError;
+
+ }
+
+ GEODEBUG( "\t\tEnding search at point " << ( _points.size() == 0 ? "(beginning)" : maybePointIt->toString() ) );
+
+ int numToAddBack = erased - numToErase;
+ assert( numToAddBack >= 0 );
+
+ GEODEBUG( "\t\tNum tested valid : " << tested.size() << " erased : " << erased << " added back : " << numToAddBack );
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = tested.begin(); it != tested.end(); it++ ){
+ log() << "Tested Point: " << *it << endl;
+ }
+#endif
+ Holder::iterator testedIt = tested.begin();
+ for( int i = 0; i < numToAddBack && testedIt != tested.end(); i++ ){
+ _points.insert( *testedIt );
+ testedIt++;
+ }
+ }
+ }
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){
+ log() << "Point: " << *it << endl;
+ }
+#endif
+ // We've now trimmed first set of unneeded points
+
+ GEODEBUG( "\t\t Start expanding, num points : " << _points.size() << " max : " << _max );
+
+ // Step 2: iterate through all points and add as needed
+
+ unsigned expandedPoints = 0;
+ Holder::iterator it = _points.begin();
+ double expandWindowEnd = -1;
+ while( it != _points.end() ){
+ const GeoPoint& currPt = *it;
+
+ // TODO: If one point is exact, maybe not 2 * _distError
+
+ // See if we're in an expand window
+ bool inWindow = currPt.distance() <= expandWindowEnd;
+ // If we're not, and we're done with points, break
+ if( ! inWindow && expandedPoints >= _max ) break;
+
+ bool expandApprox = ! currPt.isExact() && ( ! _uniqueDocs || ( finish && _needDistance ) || inWindow );
+
+ if( expandApprox ){
+
+ // Add new point(s)
+ // These will only be added in a radius of 2 * _distError around the current point,
+ // so should not affect previously valid points.
+ int before, after;
+ addExactPoints( currPt, _points, before, after, false );
+ expandedPoints += before;
+
+ if( _max > 0 && expandedPoints < _max )
+ expandWindowEnd = currPt.distance() + 2 * _distError;
+
+ // Iterate to the next point
+ Holder::iterator current = it++;
+ // Erase the current point
+ _points.erase( current );
+
+ }
+ else{
+ expandedPoints++;
+ it++;
+ }
+ }
+
+ GEODEBUG( "\t\tFinished expanding, num points : " << _points.size() << " max : " << _max );
+
+ // Finish
+ // TODO: Don't really need to trim?
+ for( ; expandedPoints > _max; expandedPoints-- ) it--;
+ _points.erase( it, _points.end() );
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){
+ log() << "Point: " << *it << endl;
+ }
+#endif
+ }
+
+ virtual GeoHash expandStartHash(){
+ return _start;
+ }
+
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ){
+ return width >= _scanDistance;
+ }
+
+ // Whether the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ){
+ return cur.intersects( _want );
+ }
+
+ GeoHash _start;
+ int _numWanted;
+ double _scanDistance;
+
+ long long _nscanned;
+ int _found;
+ GeoDistType _type;
+
+ Box _want;
+ };
+
+ class GeoSearchCursor : public GeoCursorBase {
+ public:
+
+ GeoSearchCursor( shared_ptr<GeoSearch> s )
+ : GeoCursorBase( s->_spec ) ,
+ _s( s ) , _cur( s->_points.begin() ) , _end( s->_points.end() ), _nscanned() {
+ if ( _cur != _end ) {
+ ++_nscanned;
+ }
+ }
+
+ virtual ~GeoSearchCursor() {}
+
+ virtual bool ok() {
+ return _cur != _end;
+ }
+
+ virtual Record* _current() { assert(ok()); return _cur->_loc.rec(); }
+ virtual BSONObj current() { assert(ok()); return _cur->_o; }
+ virtual DiskLoc currLoc() { assert(ok()); return _cur->_loc; }
+ virtual bool advance() {
+ if( ok() ){
+ _cur++;
+ incNscanned();
+ return ok();
+ }
+ return false;
+ }
+ virtual BSONObj currKey() const { return _cur->_key; }
+
+ virtual string toString() {
+ return "GeoSearchCursor";
+ }
+
+
+ virtual BSONObj prettyStartKey() const {
+ return BSON( _s->_g->_geo << _s->_prefix.toString() );
+ }
+ virtual BSONObj prettyEndKey() const {
+ GeoHash temp = _s->_prefix;
+ temp.move( 1 , 1 );
+ return BSON( _s->_g->_geo << temp.toString() );
+ }
+
+ virtual long long nscanned() { return _nscanned; }
+
+ virtual CoveredIndexMatcher* matcher() const {
+ if( _s->_matcher.get() ) return _s->_matcher.get();
+ else return emptyMatcher.get();
+ }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const {
+ if( _s->_matcher.get() ) return _s->_matcher;
+ else return emptyMatcher;
+ }
+
+ shared_ptr<GeoSearch> _s;
+ GeoHopper::Holder::iterator _cur;
+ GeoHopper::Holder::iterator _end;
+
+ void incNscanned() { if ( ok() ) { ++_nscanned; } }
+ long long _nscanned;
+ };
+
+ class GeoCircleBrowse : public GeoBrowse {
+ public:
+
+ GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() , const string& type="$center", bool uniqueDocs = true )
+ : GeoBrowse( g , "circle" , filter, uniqueDocs ) {
+
+ uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 );
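+ // e.g. (illustrative): { $center : [ [ 50 , 50 ] , 10 ] } searches a radius of 10
+ // around (50,50); $centerSphere takes the radius in radians instead.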
+
+ BSONObjIterator i(circle);
+ BSONElement center = i.next();
+
+ uassert( 13656 , "the first field of $center object must be a location object" , center.isABSONObj() );
+
+ // Get geohash and exact center point
+ // TODO: For wrapping search, may be useful to allow center points outside-of-bounds here.
+ // Calculating the nearest point as a hash start inside the region would then be required.
+ _start = g->_tohash(center);
+ _startPt = Point(center);
+
+ _maxDistance = i.next().numberDouble();
+ uassert( 13061 , "need a max distance >= 0 " , _maxDistance >= 0 );
+
+ if (type == "$center") {
+ // Look in box with bounds of maxDistance in either direction
+ _type = GEO_PLAIN;
+ _xScanDistance = _maxDistance + _g->_error;
+ _yScanDistance = _maxDistance + _g->_error;
+ }
+ else if (type == "$centerSphere") {
+ // Same, but compute maxDistance using spherical transform
+
+ uassert(13461, "Spherical MaxDistance > PI. Are you sure you are using radians?", _maxDistance < M_PI);
+ checkEarthBounds( _startPt );
+
+ _type = GEO_SPHERE;
+ _yScanDistance = rad2deg( _maxDistance ) + _g->_error;
+ _xScanDistance = computeXScanDistance(_startPt._y, _yScanDistance);
+
+ uassert(13462, "Spherical distance would require wrapping, which isn't implemented yet",
+ (_startPt._x + _xScanDistance < 180) && (_startPt._x - _xScanDistance > -180) &&
+ (_startPt._y + _yScanDistance < 90) && (_startPt._y - _yScanDistance > -90));
+ }
+ else {
+ uassert(13460, "invalid $center query type: " + type, false);
+ }
+
+ // Bounding box includes fudge factor.
+ // TODO: Is this correct, since fudge factor may be spherically transformed?
+ _bBox._min = Point( _startPt._x - _xScanDistance, _startPt._y - _yScanDistance );
+ _bBox._max = Point( _startPt._x + _xScanDistance, _startPt._y + _yScanDistance );
+
+ GEODEBUG( "Bounding box for circle query : " << _bBox.toString() << " (max distance : " << _maxDistance << ")" << " starting from " << _startPt.toString() );
+
+ ok();
+ }
+
+ virtual GeoHash expandStartHash() {
+ return _start;
+ }
+
+ virtual bool fitsInBox( double width ) {
+ return width >= std::max(_xScanDistance, _yScanDistance);
+ }
+
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _bBox );
+ }
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+ // Inexact hash distance checks.
+ double error = 0;
+ switch (_type) {
+ case GEO_PLAIN:
+ d = _startPt.distance( p );
+ error = _g->_error;
+ break;
+ case GEO_SPHERE: {
+ checkEarthBounds( p );
+ d = spheredist_deg( _startPt, p );
+ error = _g->_errorSphere;
+ break;
+ }
+ default: assert( false );
+ }
+
+ // If our distance is in the error bounds...
+ if( d >= _maxDistance - error && d <= _maxDistance + error ) return BORDER;
+ return d > _maxDistance ? BAD : GOOD;
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+
+ switch (_type) {
+ case GEO_PLAIN: {
+ if( _startPt.distanceWithin( p, _maxDistance ) ) return true;
+ break;
+ }
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ if( spheredist_deg( _startPt , p ) <= _maxDistance ) return true;
+ break;
+ default: assert( false );
+ }
+
+ return false;
+ }
+
+ GeoDistType _type;
+ GeoHash _start;
+ Point _startPt;
+ double _maxDistance; // user input
+ double _xScanDistance; // affected by GeoDistType
+ double _yScanDistance; // affected by GeoDistType
+ Box _bBox;
+
+ };
+
+ class GeoBoxBrowse : public GeoBrowse {
+ public:
+
+ GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj(), bool uniqueDocs = true )
+ : GeoBrowse( g , "box" , filter, uniqueDocs ) {
+
+ uassert( 13063 , "$box needs 2 fields (bottomLeft,topRight)" , box.nFields() == 2 );
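+ // e.g. (illustrative): { $box : [ [ 0 , 0 ] , [ 10 , 10 ] ] } searches the axis-aligned
+ // box with corners (0,0) and (10,10); corner order is fixed up by fixBox() below.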
+
+ // Initialize an *exact* box from the given obj.
+ BSONObjIterator i(box);
+ _want._min = Point( i.next() );
+ _want._max = Point( i.next() );
+
+ _wantRegion = _want;
+ _wantRegion.fudge( g ); // Need to make sure we're checking regions within error bounds of where we want
+ fixBox( g, _wantRegion );
+ fixBox( g, _want );
+
+ uassert( 13064 , "need an area > 0 " , _want.area() > 0 );
+
+ Point center = _want.center();
+ _start = _g->hash( center._x , center._y );
+
+ GEODEBUG( "center : " << center.toString() << "\t" << _prefix );
+
+ _fudge = _g->_error;
+ _wantLen = _fudge +
+ std::max( ( _want._max._x - _want._min._x ) ,
+ ( _want._max._y - _want._min._y ) ) / 2;
+
+ ok();
+ }
+
+ void fixBox( const Geo2dType* g, Box& box ) {
+ if( box._min._x > box._max._x )
+ swap( box._min._x, box._max._x );
+ if( box._min._y > box._max._y )
+ swap( box._min._y, box._max._y );
+
+ double gMin = g->_min;
+ double gMax = g->_max;
+
+ if( box._min._x < gMin ) box._min._x = gMin;
+ if( box._min._y < gMin ) box._min._y = gMin;
+ if( box._max._x > gMax) box._max._x = gMax;
+ if( box._max._y > gMax ) box._max._y = gMax;
+ }
+
+ void swap( double& a, double& b ) {
+ double swap = a;
+ a = b;
+ b = swap;
+ }
+
+ virtual GeoHash expandStartHash() {
+ return _start;
+ }
+
+ virtual bool fitsInBox( double width ) {
+ return width >= _wantLen;
+ }
+
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _wantRegion );
+ }
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+ if( _want.onBoundary( p, _fudge ) ) return BORDER;
+ else return _want.inside( p, _fudge ) ? GOOD : BAD;
+
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+ return _want.inside( p );
+ }
+
+ Box _want;
+ Box _wantRegion;
+ double _wantLen;
+ double _fudge;
+
+ GeoHash _start;
+
+ };
+
+ class GeoPolygonBrowse : public GeoBrowse {
+ public:
+
+ GeoPolygonBrowse( const Geo2dType* g , const BSONObj& polyPoints ,
+ BSONObj filter = BSONObj(), bool uniqueDocs = true ) : GeoBrowse( g , "polygon" , filter, uniqueDocs ) {
+
+ GEODEBUG( "In Polygon" )
+
+ BSONObjIterator i( polyPoints );
+ BSONElement first = i.next();
+ _poly.add( Point( first ) );
+
+ while ( i.more() ) {
+ _poly.add( Point( i.next() ) );
+ }
+
+ uassert( 14030, "polygon must be defined by three points or more", _poly.size() >= 3 );
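+ // e.g. (illustrative): { $polygon : [ [ 0 , 0 ] , [ 10 , 0 ] , [ 5 , 10 ] ] } searches
+ // the triangle with those three vertices.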
+
+ _bounds = _poly.bounds();
+ _bounds.fudge( g ); // We need to check regions within the error bounds of these bounds
+ _bounds.truncate( g ); // We don't need to look anywhere outside the space
+
+ _maxDim = _g->_error + _bounds.maxDim() / 2;
+
+ ok();
+ }
+
+ // The initial geo hash box for our first expansion
+ virtual GeoHash expandStartHash() {
+ return _g->hash( _bounds.center() );
+ }
+
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ) {
+ return _maxDim <= width;
+ }
+
+ // Whether the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _bounds );
+ }
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+ int in = _poly.contains( p, _g->_error );
+
+ if( in == 0 ) return BORDER;
+ else return in > 0 ? GOOD : BAD;
+
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+ return _poly.contains( p );
+ }
+
+ private:
+
+ Polygon _poly;
+ Box _bounds;
+ double _maxDim;
+
+ GeoHash _start;
+ };
+
+ shared_ptr<Cursor> Geo2dType::newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
+ if ( numWanted < 0 )
+ numWanted = numWanted * -1;
+ else if ( numWanted == 0 )
+ numWanted = 100;
+
+ BSONObjIterator i(query);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ if ( _geo != e.fieldName() )
+ continue;
+
+ if ( e.type() == Array ) {
+ // If we get an array query, assume it is a location, and do a $within { $center : [[x, y], 0] } search
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ), "$center", true ) );
+ return c;
+ }
+ else if ( e.type() == Object ) {
+
+ // TODO: Filter out _geo : { $special... } field so it doesn't get matched accidentally,
+ // if matcher changes
+
+ switch ( e.embeddedObject().firstElement().getGtLtOp() ) {
+ case BSONObj::opNEAR: {
+ BSONObj n = e.embeddedObject();
+ e = n.firstElement();
+
+ const char* suffix = e.fieldName() + 5; // strlen("$near") == 5;
+ GeoDistType type;
+ if (suffix[0] == '\0') {
+ type = GEO_PLAIN;
+ }
+ else if (strcmp(suffix, "Sphere") == 0) {
+ type = GEO_SPHERE;
+ }
+ else {
+ uassert(13464, string("invalid $near search type: ") + e.fieldName(), false);
+ type = GEO_PLAIN; // prevents uninitialized warning
+ }
+
+ double maxDistance = numeric_limits<double>::max();
+ if ( e.isABSONObj() && e.embeddedObject().nFields() > 2 ) {
+ BSONObjIterator i(e.embeddedObject());
+ i.next();
+ i.next();
+ BSONElement e = i.next();
+ if ( e.isNumber() )
+ maxDistance = e.numberDouble();
+ }
+ {
+ BSONElement e = n["$maxDistance"];
+ if ( e.isNumber() )
+ maxDistance = e.numberDouble();
+ }
+
+ bool uniqueDocs = false;
+ if( ! n["$uniqueDocs"].eoo() ) uniqueDocs = n["$uniqueDocs"].trueValue();
+
+ shared_ptr<GeoSearch> s( new GeoSearch( this , Point( e ) , numWanted , query , maxDistance, type, uniqueDocs ) );
+ s->exec();
+ shared_ptr<Cursor> c;
+ c.reset( new GeoSearchCursor( s ) );
+ return c;
+ }
+ case BSONObj::opWITHIN: {
+
+ e = e.embeddedObject().firstElement();
+ uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() );
+
+ BSONObj context = e.embeddedObject();
+ e = e.embeddedObject().firstElement();
+ string type = e.fieldName();
+
+ bool uniqueDocs = true;
+ if( ! context["$uniqueDocs"].eoo() ) uniqueDocs = context["$uniqueDocs"].trueValue();
+
+ if ( startsWith(type, "$center") ) {
+ uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query , type, uniqueDocs ) );
+ return c;
+ }
+ else if ( type == "$box" ) {
+ uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) );
+ return c;
+ }
+ else if ( startsWith( type, "$poly" ) ) {
+ uassert( 14029 , "$polygon has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoPolygonBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) );
+ return c;
+ }
+ throw UserException( 13058 , str::stream() << "unknown $within information : " << context << ", a shape must be specified." );
+ }
+ default:
+ // Otherwise... assume the object defines a point, and we want to do a zero-radius $within $center
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ) ) );
+ return c;
+ }
+ }
+ }
+
+ throw UserException( 13042 , (string)"missing geo field (" + _geo + ") in : " + query.toString() );
+ }
+
+ // ------
+ // commands
+ // ------
+
+ class Geo2dFindNearCmd : public Command {
+ public:
+ Geo2dFindNearCmd() : Command( "geoNear" ) {}
+ virtual LockType locktype() const { return READ; }
+ bool slaveOk() const { return true; }
+ void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; }
+ bool slaveOverrideOk() { return true; }
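+ // e.g. (illustrative): db.runCommand( { geoNear : "places" , near : [ -73.88 , 40.78 ] ,
+ // num : 10 , spherical : true } )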
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( ! d ) {
+ errmsg = "can't find ns";
+ return false;
+ }
+
+ vector<int> idxs;
+ d->findIndexByType( GEO2DNAME , idxs );
+
+ if ( idxs.size() > 1 ) {
+ errmsg = "more than 1 geo indexes :(";
+ return false;
+ }
+
+ if ( idxs.size() == 0 ) {
+ errmsg = "no geo index :(";
+ return false;
+ }
+
+ int geoIdx = idxs[0];
+
+ result.append( "ns" , ns );
+
+ IndexDetails& id = d->idx( geoIdx );
+ Geo2dType * g = (Geo2dType*)id.getSpec().getType();
+ assert( &id == g->getDetails() );
+
+ int numWanted = 100;
+ if ( cmdObj["num"].isNumber() ) {
+ numWanted = cmdObj["num"].numberInt();
+ assert( numWanted >= 0 );
+ }
+
+ bool uniqueDocs = false;
+ if( ! cmdObj["uniqueDocs"].eoo() ) uniqueDocs = cmdObj["uniqueDocs"].trueValue();
+
+ bool includeLocs = false;
+ if( ! cmdObj["includeLocs"].eoo() ) includeLocs = cmdObj["includeLocs"].trueValue();
+
+ uassert(13046, "'near' param missing/invalid", !cmdObj["near"].eoo());
+ const Point n( cmdObj["near"] );
+ result.append( "near" , g->_tohash( cmdObj["near"] ).toString() );
+
+ BSONObj filter;
+ if ( cmdObj["query"].type() == Object )
+ filter = cmdObj["query"].embeddedObject();
+
+ double maxDistance = numeric_limits<double>::max();
+ if ( cmdObj["maxDistance"].isNumber() )
+ maxDistance = cmdObj["maxDistance"].number();
+
+ GeoDistType type = GEO_PLAIN;
+ if ( cmdObj["spherical"].trueValue() )
+ type = GEO_SPHERE;
+
+ GeoSearch gs( g , n , numWanted , filter , maxDistance , type, uniqueDocs, true );
+
+ if ( cmdObj["start"].type() == String) {
+ GeoHash start ((string) cmdObj["start"].valuestr());
+ gs._start = start;
+ }
+
+ gs.exec();
+
+ double distanceMultiplier = 1;
+ if ( cmdObj["distanceMultiplier"].isNumber() )
+ distanceMultiplier = cmdObj["distanceMultiplier"].number();
+
+ double totalDistance = 0;
+
+ BSONObjBuilder arr( result.subarrayStart( "results" ) );
+ int x = 0;
+ for ( GeoHopper::Holder::iterator i=gs._points.begin(); i!=gs._points.end(); i++ ) {
+
+ const GeoPoint& p = *i;
+ double dis = distanceMultiplier * p.distance();
+ totalDistance += dis;
+
+ BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ) ) );
+ bb.append( "dis" , dis );
+ if( includeLocs ){
+ if( p._pt.couldBeArray() ) bb.append( "loc", BSONArray( p._pt ) );
+ else bb.append( "loc" , p._pt );
+ }
+ bb.append( "obj" , p._o );
+ bb.done();
+
+ if ( arr.len() > BSONObjMaxUserSize ) {
+ warning() << "Too many results to fit in single document. Truncating..." << endl;
+ break;
+ }
+ }
+ arr.done();
+
+ BSONObjBuilder stats( result.subobjStart( "stats" ) );
+ stats.append( "time" , cc().curop()->elapsedMillis() );
+ stats.appendNumber( "btreelocs" , gs._nscanned );
+ stats.appendNumber( "nscanned" , gs._lookedAt );
+ stats.appendNumber( "objectsLoaded" , gs._objectsLoaded );
+ stats.append( "avgDistance" , totalDistance / x );
+ stats.append( "maxDistance" , gs.farthest() );
+ stats.done();
+
+ return true;
+ }
+
+ } geo2dFindNearCmd;
+
+ class GeoWalkCmd : public Command {
+ public:
+ GeoWalkCmd() : Command( "geoWalk" ) {}
+ virtual LockType locktype() const { return READ; }
+ bool slaveOk() const { return true; }
+ bool slaveOverrideOk() { return true; }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( ! d ) {
+ errmsg = "can't find ns";
+ return false;
+ }
+
+ int geoIdx = -1;
+ {
+ NamespaceDetails::IndexIterator ii = d->ii();
+ while ( ii.more() ) {
+ IndexDetails& id = ii.next();
+ if ( id.getSpec().getTypeName() == GEO2DNAME ) {
+ if ( geoIdx >= 0 ) {
+ errmsg = "2 geo indexes :(";
+ return false;
+ }
+ geoIdx = ii.pos() - 1;
+ }
+ }
+ }
+
+ if ( geoIdx < 0 ) {
+ errmsg = "no geo index :(";
+ return false;
+ }
+
+
+ IndexDetails& id = d->idx( geoIdx );
+ Geo2dType * g = (Geo2dType*)id.getSpec().getType();
+ assert( &id == g->getDetails() );
+
+ int max = 100000;
+
+ auto_ptr<BtreeCursor> bc( BtreeCursor::make( d , geoIdx , id , BSONObj() , BSONObj() , true , 1 ) );
+ BtreeCursor &c = *bc;
+ while ( c.ok() && max-- ) {
+ GeoHash h( c.currKey().firstElement() );
+ int len;
+ cout << "\t" << h.toString()
+ << "\t" << c.current()[g->_geo]
+ << "\t" << hex << h.getHash()
+ << "\t" << hex << ((long long*)c.currKey().firstElement().binData(len))[0]
+ << "\t" << c.current()["_id"]
+ << endl;
+ c.advance();
+ }
+
+ return true;
+ }
+
+ } geoWalkCmd;
+
+ struct GeoUnitTest : public UnitTest {
+
+ int round( double d ) {
+ return (int)(.5+(d*1000));
+ }
+
+#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == GeoHash(b) ); }
+
+ void run() {
+ assert( ! GeoHash::isBitSet( 0 , 0 ) );
+ assert( ! GeoHash::isBitSet( 0 , 31 ) );
+ assert( GeoHash::isBitSet( 1 , 31 ) );
+
+ IndexSpec i( BSON( "loc" << "2d" ) );
+ Geo2dType g( &geo2dplugin , &i );
+ {
+ double x = 73.01212;
+ double y = 41.352964;
+ BSONObj in = BSON( "x" << x << "y" << y );
+ GeoHash h = g._hash( in );
+ BSONObj out = g._unhash( h );
+ assert( round(x) == round( out["x"].number() ) );
+ assert( round(y) == round( out["y"].number() ) );
+ assert( round( in["x"].number() ) == round( out["x"].number() ) );
+ assert( round( in["y"].number() ) == round( out["y"].number() ) );
+ }
+
+ {
+ double x = -73.01212;
+ double y = 41.352964;
+ BSONObj in = BSON( "x" << x << "y" << y );
+ GeoHash h = g._hash( in );
+ BSONObj out = g._unhash( h );
+ assert( round(x) == round( out["x"].number() ) );
+ assert( round(y) == round( out["y"].number() ) );
+ assert( round( in["x"].number() ) == round( out["x"].number() ) );
+ assert( round( in["y"].number() ) == round( out["y"].number() ) );
+ }
+
+ {
+ GeoHash h( "0000" );
+ h.move( 0 , 1 );
+ GEOHEQ( h , "0001" );
+ h.move( 0 , -1 );
+ GEOHEQ( h , "0000" );
+
+ h.init( "0001" );
+ h.move( 0 , 1 );
+ GEOHEQ( h , "0100" );
+ h.move( 0 , -1 );
+ GEOHEQ( h , "0001" );
+
+
+ h.init( "0000" );
+ h.move( 1 , 0 );
+ GEOHEQ( h , "0010" );
+ }
+
+ {
+ Box b( 5 , 5 , 2 );
+ assert( "(5,5) -->> (7,7)" == b.toString() );
+ }
+
+ {
+ GeoHash a = g.hash( 1 , 1 );
+ GeoHash b = g.hash( 4 , 5 );
+ assert( 5 == (int)(g.distance( a , b ) ) );
+ a = g.hash( 50 , 50 );
+ b = g.hash( 42 , 44 );
+ assert( round(10) == round(g.distance( a , b )) );
+ }
+
+ {
+ GeoHash x("0000");
+ assert( 0 == x.getHash() );
+ x.init( 0 , 1 , 32 );
+ GEOHEQ( x , "0000000000000000000000000000000000000000000000000000000000000001" )
+
+ assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) );
+ assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) );
+ }
+
+ {
+ GeoHash x("1010");
+ GEOHEQ( x , "1010" );
+ GeoHash y = x + "01";
+ GEOHEQ( y , "101001" );
+ }
+
+ {
+
+ GeoHash a = g.hash( 5 , 5 );
+ GeoHash b = g.hash( 5 , 7 );
+ GeoHash c = g.hash( 100 , 100 );
+ /*
+ cout << "a: " << a << endl;
+ cout << "b: " << b << endl;
+ cout << "c: " << c << endl;
+
+ cout << "a: " << a.toStringHex1() << endl;
+ cout << "b: " << b.toStringHex1() << endl;
+ cout << "c: " << c.toStringHex1() << endl;
+ */
+ BSONObj oa = a.wrap();
+ BSONObj ob = b.wrap();
+ BSONObj oc = c.wrap();
+ /*
+ cout << "a: " << oa.hexDump() << endl;
+ cout << "b: " << ob.hexDump() << endl;
+ cout << "c: " << oc.hexDump() << endl;
+ */
+ assert( oa.woCompare( ob ) < 0 );
+ assert( oa.woCompare( oc ) < 0 );
+
+ }
+
+ {
+ GeoHash x( "000000" );
+ x.move( -1 , 0 );
+ GEOHEQ( x , "101010" );
+ x.move( 1 , -1 );
+ GEOHEQ( x , "010101" );
+ x.move( 0 , 1 );
+ GEOHEQ( x , "000000" );
+ }
+
+ {
+ GeoHash prefix( "110011000000" );
+ GeoHash entry( "1100110000011100000111000001110000011100000111000001000000000000" );
+ assert( ! entry.hasPrefix( prefix ) );
+
+ entry = GeoHash("1100110000001100000111000001110000011100000111000001000000000000");
+ assert( entry.toString().find( prefix.toString() ) == 0 );
+ assert( entry.hasPrefix( GeoHash( "1100" ) ) );
+ assert( entry.hasPrefix( prefix ) );
+ }
+
+ {
+ GeoHash a = g.hash( 50 , 50 );
+ GeoHash b = g.hash( 48 , 54 );
+ assert( round( 4.47214 ) == round( g.distance( a , b ) ) );
+ }
+
+
+ {
+ Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) );
+ assert( b.inside( 29.763 , -95.363 ) );
+ assert( ! b.inside( 32.9570255 , -96.1082497 ) );
+ assert( ! b.inside( 32.9570255 , -96.1082497 , .01 ) );
+ }
+
+ {
+ GeoHash a( "11001111" );
+ assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11") ) );
+ assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11110000") ) );
+ }
+
+ {
+ int N = 10000;
+ {
+ Timer t;
+ for ( int i=0; i<N; i++ ) {
+ unsigned x = (unsigned)rand();
+ unsigned y = (unsigned)rand();
+ GeoHash h( x , y );
+ unsigned a,b;
+ h.unhash_slow( a,b );
+ assert( a == x );
+ assert( b == y );
+ }
+ //cout << "slow: " << t.millis() << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i=0; i<N; i++ ) {
+ unsigned x = (unsigned)rand();
+ unsigned y = (unsigned)rand();
+ GeoHash h( x , y );
+ unsigned a,b;
+ h.unhash_fast( a,b );
+ assert( a == x );
+ assert( b == y );
+ }
+ //cout << "fast: " << t.millis() << endl;
+ }
+
+ }
+
+ {
+ // see http://en.wikipedia.org/wiki/Great-circle_distance#Worked_example
+
+ {
+ Point BNA (-86.67, 36.12);
+ Point LAX (-118.40, 33.94);
+
+ double dist1 = spheredist_deg(BNA, LAX);
+ double dist2 = spheredist_deg(LAX, BNA);
+
+ // target is 0.45306
+ assert( 0.45305 <= dist1 && dist1 <= 0.45307 );
+ assert( 0.45305 <= dist2 && dist2 <= 0.45307 );
+ }
+ {
+ Point BNA (-1.5127, 0.6304);
+ Point LAX (-2.0665, 0.5924);
+
+ double dist1 = spheredist_rad(BNA, LAX);
+ double dist2 = spheredist_rad(LAX, BNA);
+
+ // target is 0.45306
+ assert( 0.45305 <= dist1 && dist1 <= 0.45307 );
+ assert( 0.45305 <= dist2 && dist2 <= 0.45307 );
+ }
+ {
+ Point JFK (-73.77694444, 40.63861111 );
+ Point LAX (-118.40, 33.94);
+
+ double dist = spheredist_deg(JFK, LAX) * EARTH_RADIUS_MILES;
+ assert( dist > 2469 && dist < 2470 );
+ }
+
+ {
+ Point BNA (-86.67, 36.12);
+ Point LAX (-118.40, 33.94);
+ Point JFK (-73.77694444, 40.63861111 );
+ assert( spheredist_deg(BNA, BNA) < 1e-6);
+ assert( spheredist_deg(LAX, LAX) < 1e-6);
+ assert( spheredist_deg(JFK, JFK) < 1e-6);
+
+ Point zero (0, 0);
+ Point antizero (0,-180);
+
+ // these were known to cause NaN
+ assert( spheredist_deg(zero, zero) < 1e-6);
+ assert( fabs(M_PI-spheredist_deg(zero, antizero)) < 1e-6);
+ assert( fabs(M_PI-spheredist_deg(antizero, zero)) < 1e-6);
+ }
+ }
+ }
+ } geoUnitTest;
+
+
+}
+
diff --git a/src/mongo/db/geo/core.h b/src/mongo/db/geo/core.h
new file mode 100644
index 00000000000..c49131e0162
--- /dev/null
+++ b/src/mongo/db/geo/core.h
@@ -0,0 +1,550 @@
+// core.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../pch.h"
+#include "../jsobj.h"
+
+#include <cmath>
+
+#ifndef M_PI
+# define M_PI 3.14159265358979323846
+#endif
+
+namespace mongo {
+
+ class GeoBitSets {
+ public:
+ GeoBitSets() {
+ for ( int i=0; i<32; i++ ) {
+ masks32[i] = ( 1 << ( 31 - i ) );
+ }
+ for ( int i=0; i<64; i++ ) {
+ masks64[i] = ( 1LL << ( 63 - i ) );
+ }
+
+ for ( unsigned i=0; i<16; i++ ) {
+ unsigned fixed = 0;
+ for ( int j=0; j<4; j++ ) {
+ if ( i & ( 1 << j ) )
+ fixed |= ( 1 << ( j * 2 ) );
+ }
+ hashedToNormal[fixed] = i;
+ }
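+ // hashedToNormal maps a byte whose even bits hold 4 interleaved coordinate bits back
+ // to the plain 4-bit value, so unhash_fast() can de-interleave a nibble at a time.
+ // e.g. i = 6 (0b0110) spreads to fixed = 20 (0b00010100), and hashedToNormal[20] == 6.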
+
+ long long currAllX = 0, currAllY = 0;
+ for ( int i = 0; i < 64; i++ ){
+ if( i % 2 == 0 ){
+ allX[ i / 2 ] = currAllX;
+ currAllX = currAllX + ( 1LL << ( 63 - i ) );
+ }
+ else{
+ allY[ i / 2 ] = currAllY;
+ currAllY = currAllY + ( 1LL << ( 63 - i ) );
+ }
+ }
+ }
+ int masks32[32];
+ long long masks64[64];
+ long long allX[32];
+ long long allY[32];
+
+ unsigned hashedToNormal[256];
+ };
+
+ extern GeoBitSets geoBitSets;
+
+ class GeoHash {
+ public:
+
+ GeoHash()
+ : _hash(0),_bits(0) {
+ }
+
+ explicit GeoHash( const char * hash ) {
+ init( hash );
+ }
+
+ explicit GeoHash( const string& hash ) {
+ init( hash );
+ }
+
+ static GeoHash makeFromBinData(const char *bindata, unsigned bits) {
+ GeoHash h;
+ h._bits = bits;
+ h._copy( (char*)&h._hash , bindata );
+ h._fix();
+ return h;
+ }
+
+ explicit GeoHash( const BSONElement& e , unsigned bits=32 ) {
+ _bits = bits;
+ if ( e.type() == BinData ) {
+ int len = 0;
+ _copy( (char*)&_hash , e.binData( len ) );
+ assert( len == 8 );
+ _bits = bits;
+ }
+ else {
+ cout << "GeoHash bad element: " << e << endl;
+ uassert(13047,"wrong type for geo index. if you're using a pre-release version, need to rebuild index",0);
+ }
+ _fix();
+ }
+
+ GeoHash( unsigned x , unsigned y , unsigned bits=32) {
+ init( x , y , bits );
+ }
+
+ GeoHash( const GeoHash& old ) {
+ _hash = old._hash;
+ _bits = old._bits;
+ }
+
+ GeoHash( long long hash , unsigned bits )
+ : _hash( hash ) , _bits( bits ) {
+ _fix();
+ }
+
+ void init( unsigned x , unsigned y , unsigned bits ) {
+ assert( bits <= 32 );
+ _hash = 0;
+ _bits = bits;
+ for ( unsigned i=0; i<bits; i++ ) {
+ if ( isBitSet( x , i ) ) _hash |= geoBitSets.masks64[i*2];
+ if ( isBitSet( y , i ) ) _hash |= geoBitSets.masks64[(i*2)+1];
+ }
+ }
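+ // e.g. init( 1 , 0 , 32 ) sets only the lowest x bit: x bits occupy even hash positions
+ // and y bits odd positions (position 0 is most significant), so toString() ends "...10".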
+
+ void unhash_fast( unsigned& x , unsigned& y ) const {
+ x = 0;
+ y = 0;
+ char * c = (char*)(&_hash);
+ for ( int i=0; i<8; i++ ) {
+ unsigned t = (unsigned)(c[i]) & 0x55;
+ y |= ( geoBitSets.hashedToNormal[t] << (4*(i)) );
+
+ t = ( (unsigned)(c[i]) >> 1 ) & 0x55;
+ x |= ( geoBitSets.hashedToNormal[t] << (4*(i)) );
+ }
+ }
+
+ void unhash_slow( unsigned& x , unsigned& y ) const {
+ x = 0;
+ y = 0;
+ for ( unsigned i=0; i<_bits; i++ ) {
+ if ( getBitX(i) )
+ x |= geoBitSets.masks32[i];
+ if ( getBitY(i) )
+ y |= geoBitSets.masks32[i];
+ }
+ }
+
+ void unhash( unsigned& x , unsigned& y ) const {
+ unhash_fast( x , y );
+ }
+
+ /**
+ * @param bit the bit position within val; 0 is the high (most significant) bit
+ */
+ static bool isBitSet( unsigned val , unsigned bit ) {
+ return geoBitSets.masks32[bit] & val;
+ }
+
+ GeoHash up() const {
+ return GeoHash( _hash , _bits - 1 );
+ }
+
+ bool hasPrefix( const GeoHash& other ) const {
+ assert( other._bits <= _bits );
+ if ( other._bits == 0 )
+ return true;
+ long long x = other._hash ^ _hash;
+ x = x >> (64-(other._bits*2));
+ return x == 0;
+ }
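+ // e.g. GeoHash( "1100" ).hasPrefix( GeoHash( "11" ) ) is true, while
+ // GeoHash( "1000" ).hasPrefix( GeoHash( "11" ) ) is not (see GeoUnitTest below).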
+
+
+ string toString() const {
+ StringBuilder buf( _bits * 2 );
+ for ( unsigned x=0; x<_bits*2; x++ )
+ buf.append( _hash & geoBitSets.masks64[x] ? "1" : "0" );
+ return buf.str();
+ }
+
+ string toStringHex1() const {
+ stringstream ss;
+ ss << hex << _hash;
+ return ss.str();
+ }
+
+ void init( const string& s ) {
+ _hash = 0;
+ _bits = s.size() / 2;
+ for ( unsigned pos=0; pos<s.size(); pos++ )
+ if ( s[pos] == '1' )
+ setBit( pos , 1 );
+ }
+
+ void setBit( unsigned pos , bool one ) {
+ assert( pos < _bits * 2 );
+ if ( one )
+ _hash |= geoBitSets.masks64[pos];
+ else if ( _hash & geoBitSets.masks64[pos] )
+ _hash &= ~geoBitSets.masks64[pos];
+ }
+
+ bool getBit( unsigned pos ) const {
+ return _hash & geoBitSets.masks64[pos];
+ }
+
+ bool getBitX( unsigned pos ) const {
+ assert( pos < 32 );
+ return getBit( pos * 2 );
+ }
+
+ bool getBitY( unsigned pos ) const {
+ assert( pos < 32 );
+ return getBit( ( pos * 2 ) + 1 );
+ }
+
+ BSONObj wrap( const char* name = "" ) const {
+ BSONObjBuilder b(20);
+ append( b , name );
+ BSONObj o = b.obj();
+ if( ! strlen( name ) ) assert( o.objsize() == 20 );
+ return o;
+ }
+
+ bool constrains() const {
+ return _bits > 0;
+ }
+
+ bool canRefine() const {
+ return _bits < 32;
+ }
+
+ bool atMinX() const {
+ return ( _hash & geoBitSets.allX[ _bits ] ) == 0;
+ }
+
+ bool atMinY() const {
+ //log() << " MinY : " << hex << (unsigned long long) _hash << " " << _bits << " " << hex << (unsigned long long) geoBitSets.allY[ _bits ] << endl;
+ return ( _hash & geoBitSets.allY[ _bits ] ) == 0;
+ }
+
+ bool atMaxX() const {
+ return ( _hash & geoBitSets.allX[ _bits ] ) == geoBitSets.allX[ _bits ];
+ }
+
+ bool atMaxY() const {
+ return ( _hash & geoBitSets.allY[ _bits ] ) == geoBitSets.allY[ _bits ];
+ }
+
+ void move( int x , int y ) {
+ assert( _bits );
+ _move( 0 , x );
+ _move( 1 , y );
+ }
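+ // Moves wrap around the edge of the space, e.g. GeoHash( "000000" ).move( -1 , 0 )
+ // yields "101010" (see GeoUnitTest below).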
+
+ void _move( unsigned offset , int d ) {
+ if ( d == 0 )
+ return;
+ assert( d <= 1 && d>= -1 ); // TEMP
+
+ bool from, to;
+ if ( d > 0 ) {
+ from = 0;
+ to = 1;
+ }
+ else {
+ from = 1;
+ to = 0;
+ }
+
+ unsigned pos = ( _bits * 2 ) - 1;
+ if ( offset == 0 )
+ pos--;
+ while ( true ) {
+ if ( getBit(pos) == from ) {
+ setBit( pos , to );
+ return;
+ }
+
+ if ( pos < 2 ) {
+ // overflow
+ for ( ; pos < ( _bits * 2 ) ; pos += 2 ) {
+ setBit( pos , from );
+ }
+ return;
+ }
+
+ setBit( pos , from );
+ pos -= 2;
+ }
+
+ assert(0);
+ }
+
+ GeoHash& operator=(const GeoHash& h) {
+ _hash = h._hash;
+ _bits = h._bits;
+ return *this;
+ }
+
+ bool operator==(const GeoHash& h ) const {
+ return _hash == h._hash && _bits == h._bits;
+ }
+
+ bool operator!=(const GeoHash& h ) const {
+ return !( *this == h );
+ }
+
+ bool operator<(const GeoHash& h ) const {
+ if( _hash != h._hash ) return _hash < h._hash;
+ return _bits < h._bits;
+ }
+
+ GeoHash& operator+=( const char * s ) {
+ unsigned pos = _bits * 2;
+ _bits += strlen(s) / 2;
+ assert( _bits <= 32 );
+ while ( s[0] ) {
+ if ( s[0] == '1' )
+ setBit( pos , 1 );
+ pos++;
+ s++;
+ }
+
+ return *this;
+ }
+
+ GeoHash operator+( const char * s ) const {
+ GeoHash n = *this;
+ n+=s;
+ return n;
+ }
+
+ GeoHash operator+( string s ) const {
+ return operator+( s.c_str() );
+ }
+
+ void _fix() {
+ static long long FULL = 0xFFFFFFFFFFFFFFFFLL;
+ long long mask = FULL << ( 64 - ( _bits * 2 ) );
+ _hash &= mask;
+ }
+
+ void append( BSONObjBuilder& b , const char * name ) const {
+ char buf[8];
+ _copy( buf , (char*)&_hash );
+ b.appendBinData( name , 8 , bdtCustom , buf );
+ }
+
+ long long getHash() const {
+ return _hash;
+ }
+
+ unsigned getBits() const {
+ return _bits;
+ }
+
+ GeoHash commonPrefix( const GeoHash& other ) const {
+ unsigned i=0;
+ for ( ; i<_bits && i<other._bits; i++ ) {
+ if ( getBitX( i ) == other.getBitX( i ) &&
+ getBitY( i ) == other.getBitY( i ) )
+ continue;
+ break;
+ }
+ return GeoHash(_hash,i);
+ }
+
+ private:
+
+ static void _copy( char * dst , const char * src ) {
+ for ( unsigned a=0; a<8; a++ ) {
+ dst[a] = src[7-a];
+ }
+ }
+
+ long long _hash;
+ unsigned _bits; // bits per field; 0 (unconstrained) to 32
+ };
+
+ inline ostream& operator<<( ostream &s, const GeoHash &h ) {
+ s << h.toString();
+ return s;
+ }
+
+ class GeoConvert {
+ public:
+ virtual ~GeoConvert() {}
+
+ virtual void unhash( const GeoHash& h , double& x , double& y ) const = 0;
+ virtual GeoHash hash( double x , double y ) const = 0;
+ };
+
+ class Point {
+ public:
+
+ Point( const GeoConvert * g , const GeoHash& hash ) {
+ g->unhash( hash , _x , _y );
+ }
+
+ explicit Point( const BSONElement& e ) {
+ BSONObjIterator i(e.Obj());
+ _x = i.next().number();
+ _y = i.next().number();
+ }
+
+ explicit Point( const BSONObj& o ) {
+ BSONObjIterator i(o);
+ _x = i.next().number();
+ _y = i.next().number();
+ }
+
+ Point( double x , double y )
+ : _x( x ) , _y( y ) {
+ }
+
+ Point() : _x(0),_y(0) {
+ }
+
+ GeoHash hash( const GeoConvert * g ) {
+ return g->hash( _x , _y );
+ }
+
+ double distance( const Point& p ) const {
+ double a = _x - p._x;
+ double b = _y - p._y;
+
+ // Avoid numerical error if possible...
+ if( a == 0 ) return abs( _y - p._y );
+ if( b == 0 ) return abs( _x - p._x );
+
+ return sqrt( ( a * a ) + ( b * b ) );
+ }
+
+ /**
+ * Distance method that compares x or y coordinates when the other direction is zero;
+ * this avoids numerical error when distances are very close to the radius but axis-aligned.
+ *
+ * An example of the problem is:
+ * (52.0 - 51.9999) - 0.0001 = 3.31965e-15 and 52.0 - 51.9999 > 0.0001 in double arithmetic,
+ * but:
+ * 51.9999 + 0.0001 <= 52.0
+ *
+ * This avoids some (but not all!) surprising results in $center queries where points are
+ * ( radius + center.x, center.y ) or vice-versa.
+ */
+ bool distanceWithin( const Point& p, double radius ) const {
+ double a = _x - p._x;
+ double b = _y - p._y;
+
+ if( a == 0 ) {
+ //
+ // Note: For some unknown reason, when a 32-bit g++ optimizes this call, the sum is
+ // calculated imprecisely. We need to force the compiler to always evaluate it correctly,
+ // hence the weirdness.
+ //
+ // On some 32-bit linux machines, removing the volatile keyword or calculating the sum inline
+ // will make certain geo tests fail. Of course this check will force volatile for all 32-bit systems,
+ // not just affected systems.
+ if( sizeof(void*) <= 4 ){
+ volatile double sum = _y > p._y ? p._y + radius : _y + radius;
+ return _y > p._y ? sum >= _y : sum >= p._y;
+ }
+ else {
+ // Original math, correct for most systems
+ return _y > p._y ? p._y + radius >= _y : _y + radius >= p._y;
+ }
+ }
+ if( b == 0 ) {
+ if( sizeof(void*) <= 4 ){
+ volatile double sum = _x > p._x ? p._x + radius : _x + radius;
+ return _x > p._x ? sum >= _x : sum >= p._x;
+ }
+ else {
+ return _x > p._x ? p._x + radius >= _x : _x + radius >= p._x;
+ }
+ }
+
+ return sqrt( ( a * a ) + ( b * b ) ) <= radius;
+ }
+
+ string toString() const {
+ StringBuilder buf(32);
+ buf << "(" << _x << "," << _y << ")";
+ return buf.str();
+
+ }
+
+ double _x;
+ double _y;
+ };
+
+
+ extern const double EARTH_RADIUS_KM;
+ extern const double EARTH_RADIUS_MILES;
+
+ // Technically lat/long bounds, not really tied to earth radius.
+ inline void checkEarthBounds( Point p ) {
+ uassert( 14808, str::stream() << "point " << p.toString() << " must be in earth-like bounds of long : [-180, 180), lat : [-90, 90] ",
+ p._x >= -180 && p._x < 180 && p._y >= -90 && p._y <= 90 );
+ }
+
+ inline double deg2rad(double deg) { return deg * (M_PI/180); }
+ inline double rad2deg(double rad) { return rad * (180/M_PI); }
+
+ // WARNING: _x and _y MUST be longitude and latitude in that order
+ // note: multiply by earth radius for distance
+ inline double spheredist_rad( const Point& p1, const Point& p2 ) {
+ // this uses the n-vector formula: http://en.wikipedia.org/wiki/N-vector
+ // If you try to match the code to the formula, note that I inline the cross-product.
+ // TODO: optimize with SSE
+
+ double sin_x1(sin(p1._x)), cos_x1(cos(p1._x));
+ double sin_y1(sin(p1._y)), cos_y1(cos(p1._y));
+ double sin_x2(sin(p2._x)), cos_x2(cos(p2._x));
+ double sin_y2(sin(p2._y)), cos_y2(cos(p2._y));
+
+ double cross_prod =
+ (cos_y1*cos_x1 * cos_y2*cos_x2) +
+ (cos_y1*sin_x1 * cos_y2*sin_x2) +
+ (sin_y1 * sin_y2);
+
+ if (cross_prod >= 1 || cross_prod <= -1) {
+ // fun with floats
+ assert( fabs(cross_prod)-1 < 1e-6 );
+ return cross_prod > 0 ? 0 : M_PI;
+ }
+
+ return acos(cross_prod);
+ }
+
+ // note: return is still in radians as that can be multiplied by radius to get arc length
+ inline double spheredist_deg( const Point& p1, const Point& p2 ) {
+ return spheredist_rad(
+ Point( deg2rad(p1._x), deg2rad(p1._y) ),
+ Point( deg2rad(p2._x), deg2rad(p2._y) )
+ );
+ }
+
+}
diff --git a/src/mongo/db/geo/haystack.cpp b/src/mongo/db/geo/haystack.cpp
new file mode 100644
index 00000000000..104665087f6
--- /dev/null
+++ b/src/mongo/db/geo/haystack.cpp
@@ -0,0 +1,318 @@
+// db/geo/haystack.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../namespace-inl.h"
+#include "../jsobj.h"
+#include "../index.h"
+#include "../../util/unittest.h"
+#include "../commands.h"
+#include "../pdfile.h"
+#include "../btree.h"
+#include "../curop-inl.h"
+#include "../matcher.h"
+#include "core.h"
+#include "../../util/timer.h"
+
+#define GEOQUADDEBUG(x)
+//#define GEOQUADDEBUG(x) cout << x << endl
+
+/**
+ * This is a geo-based search piece, which is different from the regular geo lookup.
+ * It is useful when you want to look for something within a region where the match ratio is low;
+ * it works well for searching for restaurants with a certain name within 25 miles,
+ * but should not be used for finding the closest restaurants that are open.
+ */
+namespace mongo {
+
+ string GEOSEARCHNAME = "geoHaystack";
+
+ class GeoHaystackSearchHopper {
+ public:
+ GeoHaystackSearchHopper( const BSONObj& n , double maxDistance , unsigned limit , const string& geoField )
+ : _near( n ) , _maxDistance( maxDistance ) , _limit( limit ) , _geoField(geoField) {
+
+ }
+
+ void got( const DiskLoc& loc ) {
+ Point p( loc.obj().getFieldDotted( _geoField ) );
+ if ( _near.distance( p ) > _maxDistance )
+ return;
+ _locs.push_back( loc );
+ }
+
+ int append( BSONArrayBuilder& b ) {
+ for ( unsigned i=0; i<_locs.size() && i<_limit; i++ )
+ b.append( _locs[i].obj() );
+ return _locs.size();
+ }
+
+ Point _near;
+ double _maxDistance;
+ unsigned _limit;
+ string _geoField;
+
+ vector<DiskLoc> _locs;
+ };
+
+ class GeoHaystackSearchIndex : public IndexType {
+
+ public:
+
+ GeoHaystackSearchIndex( const IndexPlugin* plugin , const IndexSpec* spec )
+ : IndexType( plugin , spec ) {
+
+ BSONElement e = spec->info["bucketSize"];
+ uassert( 13321 , "need bucketSize" , e.isNumber() );
+ _bucketSize = e.numberDouble();
+
+ BSONObjBuilder orderBuilder;
+
+ BSONObjIterator i( spec->keyPattern );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == String && GEOSEARCHNAME == e.valuestr() ) {
+ uassert( 13314 , "can't have 2 geo fields" , _geo.size() == 0 );
+ uassert( 13315 , "2d has to be first in index" , _other.size() == 0 );
+ _geo = e.fieldName();
+ }
+ else {
+ _other.push_back( e.fieldName() );
+ }
+ orderBuilder.append( "" , 1 );
+ }
+
+ uassert( 13316 , "no geo field specified" , _geo.size() );
+ uassert( 13317 , "no other fields specified" , _other.size() );
+ uassert( 13326 , "quadrant search can only have 1 other field for now" , _other.size() == 1 );
+ _order = orderBuilder.obj();
+ }
+
+ int hash( const BSONElement& e ) const {
+ uassert( 13322 , "not a number" , e.isNumber() );
+ return hash( e.numberDouble() );
+ }
+
+ int hash( double d ) const {
+ d += 180;
+ d /= _bucketSize;
+ return (int)d;
+ }
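+ // e.g. (illustrative) with _bucketSize = 1, a longitude of -73.77 lands in bucket
+ // (int)( ( -73.77 + 180 ) / 1 ) = 106.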
+
+ string makeString( int hashedX , int hashedY ) const {
+ stringstream ss;
+ ss << hashedX << "_" << hashedY;
+ return ss.str();
+ }
+
+ void _add( const BSONObj& obj, const string& root , const BSONElement& e , BSONObjSet& keys ) const {
+ BSONObjBuilder buf;
+ buf.append( "" , root );
+ if ( e.eoo() )
+ buf.appendNull( "" );
+ else
+ buf.appendAs( e , "" );
+
+ BSONObj key = buf.obj();
+ GEOQUADDEBUG( obj << "\n\t" << root << "\n\t" << key );
+ keys.insert( key );
+ }
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+
+ BSONElement loc = obj.getFieldDotted( _geo );
+ if ( loc.eoo() )
+ return;
+
+ uassert( 13323 , "latlng not an array" , loc.isABSONObj() );
+ string root;
+ {
+ BSONObjIterator i( loc.Obj() );
+ BSONElement x = i.next();
+ BSONElement y = i.next();
+ root = makeString( hash(x) , hash(y) );
+ }
+
+
+ assert( _other.size() == 1 );
+
+ BSONElementSet all;
+ obj.getFieldsDotted( _other[0] , all );
+
+ if ( all.size() == 0 ) {
+ _add( obj , root , BSONElement() , keys );
+ }
+ else {
+ for ( BSONElementSet::iterator i=all.begin(); i!=all.end(); ++i ) {
+ _add( obj , root , *i , keys );
+ }
+ }
+
+ }
+
+ shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
+ shared_ptr<Cursor> c;
+ assert(0);
+ return c;
+ }
+
+ void searchCommand( NamespaceDetails* nsd , int idxNo ,
+ const BSONObj& n /*near*/ , double maxDistance , const BSONObj& search ,
+ BSONObjBuilder& result , unsigned limit ) {
+
+ Timer t;
+
+ log(1) << "SEARCH near:" << n << " maxDistance:" << maxDistance << " search: " << search << endl;
+ int x,y;
+ {
+ BSONObjIterator i( n );
+ x = hash( i.next() );
+ y = hash( i.next() );
+ }
+ int scale = (int)ceil( maxDistance / _bucketSize );
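+ // e.g. (illustrative) with maxDistance = 25 and _bucketSize = 10, scale = 3 and the
+ // loops below scan a 7x7 grid of buckets around (x,y).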
+
+ GeoHaystackSearchHopper hopper(n,maxDistance,limit,_geo);
+
+ long long btreeMatches = 0;
+
+ for ( int a=-scale; a<=scale; a++ ) {
+ for ( int b=-scale; b<=scale; b++ ) {
+
+ BSONObjBuilder bb;
+ bb.append( "" , makeString( x + a , y + b ) );
+ for ( unsigned i=0; i<_other.size(); i++ ) {
+ BSONElement e = search.getFieldDotted( _other[i] );
+ if ( e.eoo() )
+ bb.appendNull( "" );
+ else
+ bb.appendAs( e , "" );
+ }
+
+ BSONObj key = bb.obj();
+
+ GEOQUADDEBUG( "KEY: " << key );
+
+ set<DiskLoc> thisPass;
+ scoped_ptr<BtreeCursor> cursor( BtreeCursor::make( nsd , idxNo , *getDetails() , key , key , true , 1 ) );
+ while ( cursor->ok() ) {
+ pair<set<DiskLoc>::iterator, bool> p = thisPass.insert( cursor->currLoc() );
+ if ( p.second ) {
+ hopper.got( cursor->currLoc() );
+ GEOQUADDEBUG( "\t" << cursor->current() );
+ btreeMatches++;
+ }
+ cursor->advance();
+ }
+ }
+
+ }
+
+ BSONArrayBuilder arr( result.subarrayStart( "results" ) );
+ int num = hopper.append( arr );
+ arr.done();
+
+ {
+ BSONObjBuilder b( result.subobjStart( "stats" ) );
+ b.append( "time" , t.millis() );
+ b.appendNumber( "btreeMatches" , btreeMatches );
+ b.append( "n" , num );
+ b.done();
+ }
+ }
+
+ const IndexDetails* getDetails() const {
+ return _spec->getDetails();
+ }
+
+ string _geo;
+ vector<string> _other;
+
+ BSONObj _order;
+
+ double _bucketSize;
+ };
+
+ class GeoHaystackSearchIndexPlugin : public IndexPlugin {
+ public:
+ GeoHaystackSearchIndexPlugin() : IndexPlugin( GEOSEARCHNAME ) {
+ }
+
+ virtual IndexType* generate( const IndexSpec* spec ) const {
+ return new GeoHaystackSearchIndex( this , spec );
+ }
+
+ } nameIndexPlugin;
+
+
+ class GeoHaystackSearchCommand : public Command {
+ public:
+ GeoHaystackSearchCommand() : Command( "geoSearch" ) {}
+ virtual LockType locktype() const { return READ; }
+ bool slaveOk() const { return true; }
+ bool slaveOverrideOk() const { return true; }
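+ // e.g. (illustrative): db.runCommand( { geoSearch : "places" , near : [ 33 , 33 ] ,
+ // maxDistance : 6 , search : { type : "restaurant" } , limit : 30 } )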
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ string ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( ! d ) {
+ errmsg = "can't find ns";
+ return false;
+ }
+
+ vector<int> idxs;
+ d->findIndexByType( GEOSEARCHNAME , idxs );
+ if ( idxs.size() == 0 ) {
+ errmsg = "no geoSearch index";
+ return false;
+ }
+ if ( idxs.size() > 1 ) {
+ errmsg = "more than 1 geosearch index";
+ return false;
+ }
+
+ int idxNum = idxs[0];
+
+ IndexDetails& id = d->idx( idxNum );
+ GeoHaystackSearchIndex * si = (GeoHaystackSearchIndex*)id.getSpec().getType();
+ assert( &id == si->getDetails() );
+
+ BSONElement n = cmdObj["near"];
+ BSONElement maxDistance = cmdObj["maxDistance"];
+ BSONElement search = cmdObj["search"];
+
+ uassert( 13318 , "near needs to be an array" , n.isABSONObj() );
+ uassert( 13319 , "maxDistance needs a number" , maxDistance.isNumber() );
+ uassert( 13320 , "search needs to be an object" , search.type() == Object );
+
+ unsigned limit = 50;
+ if ( cmdObj["limit"].isNumber() )
+ limit = (unsigned)cmdObj["limit"].numberInt();
+
+ si->searchCommand( d , idxNum , n.Obj() , maxDistance.numberDouble() , search.Obj() , result , limit );
+
+ return 1;
+ }
+
+ } nameSearchCommand;
+
+
+
+
+
+}
diff --git a/src/mongo/db/globals.h b/src/mongo/db/globals.h
new file mode 100644
index 00000000000..093bec76a0e
--- /dev/null
+++ b/src/mongo/db/globals.h
@@ -0,0 +1,54 @@
+// @file globals.h
+// grouping of global variables to make concurrency work clearer
+
+#pragma once
+
+namespace mongo {
+
+ void assertStartingUp();
+
+ // this is prototype for now, we'll see if it is helpful
+
+ /** "value is Const After Server Init" helper
+ *
+ * Example:
+ *
+ * casi<int> foo = 3;
+ * foo.ref() = 4; // asserts if not still in server init
+ * int x = foo+1; // ok anytime
+ *
+ */
+ template< class T >
+ class casi : boost::noncopyable {
+ T val;
+ public:
+ casi(const T& t) : val(t) {
+ DEV assertStartingUp();
+ }
+ operator const T& () { return val; }
+ T& ref() {
+ DEV assertStartingUp();
+ return val;
+ }
+ };
+
+ /** partially specialized for cases where our global variable is a pointer -- we want the value
+ * pointed at to be constant, not just the pointer itself
+ */
+ template< typename T >
+ class casi<T*> : boost::noncopyable {
+ T * val;
+ void operator=(T*);
+ public:
+ casi(T* t) : val(t) {
+ DEV assertStartingUp();
+ }
+ operator const T* () { return val; }
+ const T* get() { return val; }
+ T*& ref() {
+ DEV assertStartingUp();
+ return val;
+ }
+ };
+
+}
diff --git a/src/mongo/db/helpers/dblogger.h b/src/mongo/db/helpers/dblogger.h
new file mode 100644
index 00000000000..4d6ee6d78c4
--- /dev/null
+++ b/src/mongo/db/helpers/dblogger.h
@@ -0,0 +1,31 @@
+// @file dblogger.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ /** helper to log (and read log) of a capped collection in the database */
+ class DBLogger {
+ bool _inited;
+ public:
+ const string _ns;
+ DBLogger(string ns) : _inited(false), _ns(ns) { }
+ };
+
+}
diff --git a/src/mongo/db/index.cpp b/src/mongo/db/index.cpp
new file mode 100644
index 00000000000..5eaeab551df
--- /dev/null
+++ b/src/mongo/db/index.cpp
@@ -0,0 +1,446 @@
+/** @file index.cpp */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "namespace-inl.h"
+#include "index.h"
+#include "btree.h"
+#include "background.h"
+#include "repl/rs.h"
+#include "ops/delete.h"
+
+
+namespace mongo {
+
+ template< class V >
+ class IndexInterfaceImpl : public IndexInterface {
+ public:
+ typedef typename V::KeyOwned KeyOwned;
+ typedef Continuation<V> Cont;
+ virtual int keyCompare(const BSONObj& l,const BSONObj& r, const Ordering &ordering);
+
+ Cont *c[NamespaceDetails::NIndexesMax];
+ int n;
+
+ public:
+ IndexInterfaceImpl() { n = 0; }
+
+        /* lacking write concurrency, this supports only a single writer at a time */
+ void _phasedBegin() {
+ // we do this here as phasedFinish can throw exceptions (we could catch there, but just as easy to do here)
+ for( int i = 0; i < n; i++ ) {
+ delete c[i];
+ c[i] = 0; // defensive
+ }
+ n = 0;
+ }
+ void phasedQueueItemToInsert(
+ int idxNo,
+ DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key,
+ const Ordering& _order, IndexDetails& _idx, bool dupsAllowed)
+ {
+ if( idxNo >= n )
+ n = idxNo + 1;
+ Cont *C = c[idxNo] = new Cont(thisLoc, _recordLoc, _key, _order, _idx);
+ thisLoc.btree<V>()->twoStepInsert(thisLoc, *C, dupsAllowed);
+ }
+ void _phasedFinish() {
+ for( int i = 0; i < n; i++ ) {
+                // when mixing v0 and v1 indexes (and only then) there can be nulls in the list
+ if( c[i] ) {
+ c[i]->stepTwo();
+ }
+ }
+ }
+
+/* virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction) {
+ return thisLoc.btree<V>()->locate(idx, thisLoc, key, order, pos, found, recordLoc, direction);
+ }
+ */
+ virtual long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order) {
+ return thisLoc.btree<V>()->fullValidate(thisLoc, order);
+ }
+ virtual DiskLoc findSingle(const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key) const {
+ return thisLoc.btree<V>()->findSingle(indexdetails,thisLoc,key);
+ }
+ virtual bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const {
+ return thisLoc.btree<V>()->unindex(thisLoc, id, key, recordLoc);
+ }
+ virtual int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const {
+ return thisLoc.btree<V>()->bt_insert(thisLoc, recordLoc, key, order, dupsAllowed, idx, toplevel);
+ }
+ virtual DiskLoc addBucket(const IndexDetails& id) {
+ return BtreeBucket<V>::addBucket(id);
+ }
+ virtual void uassertIfDups(IndexDetails& idx, vector<BSONObj*>& addedKeys, DiskLoc head, DiskLoc self, const Ordering& ordering) {
+ const BtreeBucket<V> *h = head.btree<V>();
+ for( vector<BSONObj*>::iterator i = addedKeys.begin(); i != addedKeys.end(); i++ ) {
+ KeyOwned k(**i);
+ bool dup = h->wouldCreateDup(idx, head, k, ordering, self);
+ uassert( 11001 , h->dupKeyError( idx , k ) , !dup);
+ }
+ }
+
+ // for geo:
+ virtual bool isUsed(DiskLoc thisLoc, int pos) { return thisLoc.btree<V>()->isUsed(pos); }
+ virtual void keyAt(DiskLoc thisLoc, int pos, BSONObj& key, DiskLoc& recordLoc) {
+ recordLoc = DiskLoc();
+ const BtreeBucket<V>* bucket = thisLoc.btree<V>();
+ int n = bucket->nKeys();
+
+ if( pos < 0 || pos >= n || n == 0xffff /* bucket deleted */ || ! bucket->isUsed( pos ) ){
+ // log() << "Pos: " << pos << " n " << n << endl;
+ return;
+ }
+
+ typename BtreeBucket<V>::KeyNode kn = bucket->keyNode(pos);
+ key = kn.key.toBson();
+ recordLoc = kn.recordLoc;
+ }
+ virtual BSONObj keyAt(DiskLoc thisLoc, int pos) {
+ return thisLoc.btree<V>()->keyAt(pos).toBson();
+ }
+ virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) {
+ return thisLoc.btree<V>()->locate(idx, thisLoc, key, order, pos, found, recordLoc, direction);
+ }
+ virtual DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V>()->advance(thisLoc,keyOfs,direction,caller);
+ }
+ };
+
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o); // key.cpp
+
+ template <>
+ int IndexInterfaceImpl< V0 >::keyCompare(const BSONObj& l, const BSONObj& r, const Ordering &ordering) {
+ return oldCompare(l, r, ordering);
+ }
+
+ template <>
+ int IndexInterfaceImpl< V1 >::keyCompare(const BSONObj& l, const BSONObj& r, const Ordering &ordering) {
+ return l.woCompare(r, ordering, /*considerfieldname*/false);
+ }
+
+ IndexInterfaceImpl<V0> iii_v0;
+ IndexInterfaceImpl<V1> iii_v1;
+
+ IndexInterface *IndexDetails::iis[] = { &iii_v0, &iii_v1 };
+
+ void IndexInterface::phasedBegin() {
+ iii_v0._phasedBegin();
+ iii_v1._phasedBegin();
+ }
+ void IndexInterface::phasedFinish() {
+ iii_v0._phasedFinish();
+ iii_v1._phasedFinish();
+ }
+
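The begin/queue/finish split lets the first step fault pages in (and possibly throw) before the second step mutates anything. A sketch of the intended call order for inserting one key per index; d, recordLoc, key, ordering and dupsAllowed are assumed to be in scope:

    IndexInterface::phasedBegin();                    // discard leftover continuations
    for ( int i = 0; i < d->nIndexes; i++ ) {         // d : NamespaceDetails*
        IndexDetails& idx = d->idx(i);
        idx.idxInterface().phasedQueueItemToInsert(
            i, idx.head, recordLoc, key, ordering, idx, dupsAllowed );
    }
    IndexInterface::phasedFinish();                   // stepTwo() on each queued insert
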
+ int removeFromSysIndexes(const char *ns, const char *idxName) {
+ string system_indexes = cc().database()->name + ".system.indexes";
+ BSONObjBuilder b;
+ b.append("ns", ns);
+ b.append("name", idxName); // e.g.: { name: "ts_1", ns: "foo.coll" }
+ BSONObj cond = b.done();
+ return (int) deleteObjects(system_indexes.c_str(), cond, false, false, true);
+ }
+
+    /* this is just an attempt to clean up old orphaned stuff on a delete-all-indexes
+       call. repairDatabase is the clean solution, but this gives one a lighter-weight
+       partial option. see dropIndexes()
+ */
+ void assureSysIndexesEmptied(const char *ns, IndexDetails *idIndex) {
+ string system_indexes = cc().database()->name + ".system.indexes";
+ BSONObjBuilder b;
+ b.append("ns", ns);
+ if( idIndex ) {
+ b.append("name", BSON( "$ne" << idIndex->indexName().c_str() ));
+ }
+ BSONObj cond = b.done();
+ int n = (int) deleteObjects(system_indexes.c_str(), cond, false, false, true);
+ if( n ) {
+ log() << "info: assureSysIndexesEmptied cleaned up " << n << " entries" << endl;
+ }
+ }
+
+ int IndexDetails::keyPatternOffset( const string& key ) const {
+ BSONObjIterator i( keyPattern() );
+ int n = 0;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( key == e.fieldName() )
+ return n;
+ n++;
+ }
+ return -1;
+ }
+
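For example, given an index whose key pattern is { lastname: 1, firstname: 1 }:

    idx.keyPatternOffset( "lastname" );   // 0
    idx.keyPatternOffset( "firstname" );  // 1
    idx.keyPatternOffset( "age" );        // -1 (not in the pattern)
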
+ const IndexSpec& IndexDetails::getSpec() const {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ return NamespaceDetailsTransient::get_inlock( info.obj()["ns"].valuestr() ).getIndexSpec( this );
+ }
+
+ /* delete this index. does NOT clean up the system catalog
+ (system.indexes or system.namespaces) -- only NamespaceIndex.
+ */
+ void IndexDetails::kill_idx() {
+ string ns = indexNamespace(); // e.g. foo.coll.$ts_1
+ try {
+
+ string pns = parentNS(); // note we need a copy, as parentNS() won't work after the drop() below
+
+ // clean up parent namespace index cache
+ NamespaceDetailsTransient::get( pns.c_str() ).deletedIndex();
+
+ string name = indexName();
+
+ /* important to catch exception here so we can finish cleanup below. */
+ try {
+ dropNS(ns.c_str());
+ }
+ catch(DBException& ) {
+ log(2) << "IndexDetails::kill(): couldn't drop ns " << ns << endl;
+ }
+ head.setInvalid();
+ info.setInvalid();
+
+ // clean up in system.indexes. we do this last on purpose.
+ int n = removeFromSysIndexes(pns.c_str(), name.c_str());
+ wassert( n == 1 );
+
+ }
+ catch ( DBException &e ) {
+ log() << "exception in kill_idx: " << e << ", ns: " << ns << endl;
+ }
+ }
+
+ void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSet& keys) const {
+ getSpec().getKeys( obj, keys );
+ }
+
+ void setDifference(BSONObjSet &l, BSONObjSet &r, vector<BSONObj*> &diff) {
+ // l and r must use the same ordering spec.
+ verify( 14819, l.key_comp().order() == r.key_comp().order() );
+ BSONObjSet::iterator i = l.begin();
+ BSONObjSet::iterator j = r.begin();
+ while ( 1 ) {
+ if ( i == l.end() )
+ break;
+ while ( j != r.end() && j->woCompare( *i ) < 0 )
+ j++;
+ if ( j == r.end() || i->woCompare(*j) != 0 ) {
+ const BSONObj *jo = &*i;
+ diff.push_back( (BSONObj *) jo );
+ }
+ i++;
+ }
+ }
+
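Both sets must be sorted under the same ordering for the lockstep walk to be valid (hence the verify above). The same merge-style difference over plain std::set, as a self-contained sketch of the algorithm:

    #include <set>
    #include <string>
    #include <vector>

    // computes diff = l \ r for two sorted sets, mirroring setDifference() above
    void setDifferenceSketch( const std::set<std::string>& l,
                              const std::set<std::string>& r,
                              std::vector<const std::string*>& diff ) {
        std::set<std::string>::const_iterator i = l.begin(), j = r.begin();
        for ( ; i != l.end(); ++i ) {
            while ( j != r.end() && *j < *i )
                ++j;                         // skip r elements smaller than *i
            if ( j == r.end() || *i < *j )   // *i is absent from r
                diff.push_back( &*i );
        }
    }
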
+ void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId) {
+ int z = d.nIndexesBeingBuilt();
+ v.resize(z);
+ for( int i = 0; i < z; i++ ) {
+ IndexDetails& idx = d.idx(i);
+ BSONObj idxKey = idx.info.obj().getObjectField("key"); // eg { ts : 1 }
+ IndexChanges& ch = v[i];
+ idx.getKeysFromObject(oldObj, ch.oldkeys);
+ idx.getKeysFromObject(newObj, ch.newkeys);
+ if( ch.newkeys.size() > 1 )
+ d.setIndexIsMultikey(i);
+ setDifference(ch.oldkeys, ch.newkeys, ch.removed);
+ setDifference(ch.newkeys, ch.oldkeys, ch.added);
+ if ( ch.removed.size() > 0 && ch.added.size() > 0 && idx.isIdIndex() ) {
+ changedId = true;
+ }
+ }
+ }
+
+ void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc) {
+ int z = d.nIndexesBeingBuilt();
+ for( int i = 0; i < z; i++ ) {
+ IndexDetails& idx = d.idx(i);
+ v[i].dupCheck(idx, curObjLoc);
+ }
+ }
+
+    // key pattern should be { <something> : <simpletype[1|-1]>, ... }
+ static bool validKeyPattern(BSONObj kp) {
+ BSONObjIterator i(kp);
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if( e.type() == Object || e.type() == Array )
+ return false;
+ }
+ return true;
+ }
+
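So { a: 1, b: -1 } passes, as do plugin patterns like { loc: "2d" } (string values are allowed), while object or array values are rejected. Illustrative calls:

    validKeyPattern( BSON( "a" << 1 << "b" << -1 ) );     // true
    validKeyPattern( BSON( "loc" << "2d" ) );             // true
    validKeyPattern( BSON( "a" << BSON( "b" << 1 ) ) );   // false: Object value
    validKeyPattern( fromjson( "{ a: [ 1, 2 ] }" ) );     // false: Array value
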
+ /* Prepare to build an index. Does not actually build it (except for a special _id case).
+       - validates that the params are good
+       - verifies that the index does not already exist
+       - creates the source collection if it does not yet exist
+
+ example of 'io':
+ { ns : 'test.foo', name : 'z', key : { z : 1 } }
+
+ throws DBException
+
+ @param sourceNS - source NS we are indexing
+ @param sourceCollection - its details ptr
+ @return true if ok to continue. when false we stop/fail silently (index already exists)
+ */
+ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ) {
+ sourceCollection = 0;
+
+ // logical name of the index. todo: get rid of the name, we don't need it!
+ const char *name = io.getStringField("name");
+ uassert(12523, "no index name specified", *name);
+
+ // the collection for which we are building an index
+ sourceNS = io.getStringField("ns");
+ uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos);
+ uassert(10097, "bad table to index name on add index attempt",
+ cc().database()->name == nsToDatabase(sourceNS.c_str()));
+
+ BSONObj key = io.getObjectField("key");
+ uassert(12524, "index key pattern too large", key.objsize() <= 2048);
+ if( !validKeyPattern(key) ) {
+ string s = string("bad index key pattern ") + key.toString();
+ uasserted(10098 , s.c_str());
+ }
+
+ if ( sourceNS.empty() || key.isEmpty() ) {
+ log(2) << "bad add index attempt name:" << (name?name:"") << "\n ns:" <<
+ sourceNS << "\n idxobj:" << io.toString() << endl;
+ string s = "bad add index attempt " + sourceNS + " key:" + key.toString();
+ uasserted(12504, s);
+ }
+
+ sourceCollection = nsdetails(sourceNS.c_str());
+ if( sourceCollection == 0 ) {
+ // try to create it
+ string err;
+ if ( !userCreateNS(sourceNS.c_str(), BSONObj(), err, false) ) {
+ problem() << "ERROR: failed to create collection while adding its index. " << sourceNS << endl;
+ return false;
+ }
+ sourceCollection = nsdetails(sourceNS.c_str());
+ tlog() << "info: creating collection " << sourceNS << " on add index" << endl;
+ assert( sourceCollection );
+ }
+
+ if ( sourceCollection->findIndexByName(name) >= 0 ) {
+ // index already exists.
+ return false;
+ }
+ if( sourceCollection->findIndexByKeyPattern(key) >= 0 ) {
+ log(2) << "index already exists with diff name " << name << ' ' << key.toString() << endl;
+ return false;
+ }
+
+ if ( sourceCollection->nIndexes >= NamespaceDetails::NIndexesMax ) {
+ stringstream ss;
+ ss << "add index fails, too many indexes for " << sourceNS << " key:" << key.toString();
+ string s = ss.str();
+ log() << s << '\n';
+ uasserted(12505,s);
+ }
+
+ /* we can't build a new index for the ns if a build is already in progress in the background -
+ EVEN IF this is a foreground build.
+ */
+ uassert(12588, "cannot add index with a background operation in progress",
+ !BackgroundOperation::inProgForNs(sourceNS.c_str()));
+
+ /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to
+ all be treated as the same pattern.
+ */
+ if ( IndexDetails::isIdIndexPattern(key) ) {
+ if( !god ) {
+ ensureHaveIdIndex( sourceNS.c_str() );
+ return false;
+ }
+ }
+ else {
+ /* is buildIndexes:false set for this replica set member?
+ if so we don't build any indexes except _id
+ */
+ if( theReplSet && !theReplSet->buildIndexes() )
+ return false;
+ }
+
+ string pluginName = IndexPlugin::findPluginName( key );
+ IndexPlugin * plugin = pluginName.size() ? IndexPlugin::get( pluginName ) : 0;
+
+
+ {
+ BSONObj o = io;
+ if ( plugin ) {
+ o = plugin->adjustIndexSpec(o);
+ }
+ BSONObjBuilder b;
+ int v = DefaultIndexVersionNumber;
+ if( !o["v"].eoo() ) {
+ double vv = o["v"].Number();
+                // note (one day) we may be able to fresh-build fewer versions than we can use
+ // isASupportedIndexVersionNumber() is what we can use
+ uassert(14803, str::stream() << "this version of mongod cannot build new indexes of version number " << vv,
+ vv == 0 || vv == 1);
+ v = (int) vv;
+ }
+ // idea is to put things we use a lot earlier
+ b.append("v", v);
+ b.append(o["key"]);
+ if( o["unique"].trueValue() )
+                b.appendBool("unique", true); // normalize to bool true in case it was int 1 or similar
+ b.append(o["ns"]);
+
+ {
+ // stripping _id
+ BSONObjIterator i(o);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string s = e.fieldName();
+ if( s != "_id" && s != "v" && s != "ns" && s != "unique" && s != "key" )
+ b.append(e);
+ }
+ }
+
+ fixedIndexObject = b.obj();
+ }
+
+ return true;
+ }
+
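The normalization block above rewrites the client-supplied spec so that the frequently read fields come first and _id is stripped. A before/after sketch with illustrative values:

    // io (as received from the client):
    //   { _id: ObjectId("..."), ns: "test.foo", name: "z_1", key: { z: 1 }, unique: 1 }
    // fixedIndexObject (as stored):
    //   { v: 1, key: { z: 1 }, unique: true, ns: "test.foo", name: "z_1" }
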
+ void IndexSpec::reset( const IndexDetails * details ) {
+ _details = details;
+ reset( details->info );
+ }
+
+ void IndexSpec::reset( const BSONObj& _info ) {
+ info = _info;
+ keyPattern = info["key"].embeddedObjectUserCheck();
+ if ( keyPattern.objsize() == 0 ) {
+ out() << info.toString() << endl;
+ assert(false);
+ }
+ _init();
+ }
+
+}
diff --git a/src/mongo/db/index.h b/src/mongo/db/index.h
new file mode 100644
index 00000000000..d297f8a4ca1
--- /dev/null
+++ b/src/mongo/db/index.h
@@ -0,0 +1,237 @@
+// index.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "diskloc.h"
+#include "jsobj.h"
+#include "indexkey.h"
+#include "key.h"
+
+namespace mongo {
+
+ class IndexInterface {
+ protected:
+ virtual ~IndexInterface() { }
+ public:
+ static void phasedBegin();
+ virtual void phasedQueueItemToInsert(
+ int idxNo,
+ DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key,
+ const Ordering& _order, IndexDetails& _idx, bool dupsAllowed) = 0;
+ static void phasedFinish();
+
+ virtual int keyCompare(const BSONObj& l,const BSONObj& r, const Ordering &ordering) = 0;
+ virtual long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order) = 0;
+ virtual DiskLoc findSingle(const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key) const = 0;
+ virtual bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const = 0;
+ virtual int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const = 0;
+ virtual DiskLoc addBucket(const IndexDetails&) = 0;
+ virtual void uassertIfDups(IndexDetails& idx, vector<BSONObj*>& addedKeys, DiskLoc head,
+ DiskLoc self, const Ordering& ordering) = 0;
+
+ // these are for geo
+ virtual bool isUsed(DiskLoc thisLoc, int pos) = 0;
+ virtual void keyAt(DiskLoc thisLoc, int pos, BSONObj&, DiskLoc& recordLoc) = 0;
+ virtual BSONObj keyAt(DiskLoc thisLoc, int pos) = 0;
+ virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) = 0;
+ virtual DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
+ };
+
+ /* Details about a particular index. There is one of these effectively for each object in
+       system.indexes (although this also includes the head pointer, which is not in that
+ collection).
+
+ ** MemoryMapped Record ** (i.e., this is on disk data)
+ */
+ class IndexDetails {
+ public:
+ /**
+ * btree head disk location
+ * TODO We should make this variable private, since btree operations
+ * may change its value and we don't want clients to rely on an old
+ * value. If we create a btree class, we can provide a btree object
+ * to clients instead of 'head'.
+ */
+ DiskLoc head;
+
+ /* Location of index info object. Format:
+
+ { name:"nameofindex", ns:"parentnsname", key: {keypattobject}
+ [, unique: <bool>, background: <bool>, v:<version>]
+ }
+
+ This object is in the system.indexes collection. Note that since we
+ have a pointer to the object here, the object in system.indexes MUST NEVER MOVE.
+ */
+ DiskLoc info;
+
+ /* extract key value from the query object
+ e.g., if key() == { x : 1 },
+ { x : 70, y : 3 } -> { x : 70 }
+ */
+ BSONObj getKeyFromQuery(const BSONObj& query) const {
+ BSONObj k = keyPattern();
+ BSONObj res = query.extractFieldsUnDotted(k);
+ return res;
+ }
+
+ /* pull out the relevant key objects from obj, so we
+ can index them. Note that the set is multiple elements
+ only when it's a "multikey" array.
+ keys will be left empty if key not found in the object.
+ */
+ void getKeysFromObject( const BSONObj& obj, BSONObjSet& keys) const;
+
+ /* get the key pattern for this object.
+ e.g., { lastname:1, firstname:1 }
+ */
+ BSONObj keyPattern() const {
+ return info.obj().getObjectField("key");
+ }
+
+ /**
+ * @return offset into keyPattern for key
+ -1 if doesn't exist
+ */
+ int keyPatternOffset( const string& key ) const;
+ bool inKeyPattern( const string& key ) const { return keyPatternOffset( key ) >= 0; }
+
+ /* true if the specified key is in the index */
+ bool hasKey(const BSONObj& key);
+
+ // returns name of this index's storage area
+ // database.table.$index
+ string indexNamespace() const {
+ BSONObj io = info.obj();
+ string s;
+ s.reserve(Namespace::MaxNsLen);
+ s = io.getStringField("ns");
+ assert( !s.empty() );
+ s += ".$";
+ s += io.getStringField("name");
+ return s;
+ }
+
+ string indexName() const { // e.g. "ts_1"
+ BSONObj io = info.obj();
+ return io.getStringField("name");
+ }
+
+ static bool isIdIndexPattern( const BSONObj &pattern ) {
+ BSONObjIterator i(pattern);
+ BSONElement e = i.next();
+ if( strcmp(e.fieldName(), "_id") != 0 ) return false;
+ return i.next().eoo();
+ }
+
+ /* returns true if this is the _id index. */
+ bool isIdIndex() const {
+ return isIdIndexPattern( keyPattern() );
+ }
+
+ /* gets not our namespace name (indexNamespace for that),
+ but the collection we index, its name.
+ */
+ string parentNS() const {
+ BSONObj io = info.obj();
+ return io.getStringField("ns");
+ }
+
+ static int versionForIndexObj( const BSONObj &obj ) {
+ BSONElement e = obj["v"];
+ if( e.type() == NumberInt )
+ return e._numberInt();
+ // should normally be an int. this is for backward compatibility
+ int v = e.numberInt();
+ uassert(14802, "index v field should be Integer type", v == 0);
+ return v;
+ }
+
+ int version() const {
+ return versionForIndexObj( info.obj() );
+ }
+
+ /** @return true if index has unique constraint */
+ bool unique() const {
+ BSONObj io = info.obj();
+ return io["unique"].trueValue() ||
+                /* temp: can we just make unique:true always be there for _id and get rid of this? */
+ isIdIndex();
+ }
+
+ /** return true if dropDups was set when building index (if any duplicates, dropdups drops the duplicating objects) */
+ bool dropDups() const {
+ return info.obj().getBoolField( "dropDups" );
+ }
+
+ /** delete this index. does NOT clean up the system catalog
+ (system.indexes or system.namespaces) -- only NamespaceIndex.
+ */
+ void kill_idx();
+
+ const IndexSpec& getSpec() const;
+
+ string toString() const {
+ return info.obj().toString();
+ }
+
+ /** @return true if supported. supported means we can use the index, including adding new keys.
+ it may not mean we can build the index version in question: we may not maintain building
+ of indexes in old formats in the future.
+ */
+ static bool isASupportedIndexVersionNumber(int v) { return (v&1)==v; } // v == 0 || v == 1
+
+        /** @return the interface for this index, which varies with the index version.
+ used for backward compatibility of index versions/formats.
+ */
+ IndexInterface& idxInterface() const {
+ int v = version();
+ dassert( isASupportedIndexVersionNumber(v) );
+ return *iis[v&1];
+ }
+
+ static IndexInterface *iis[];
+ };
+
+ struct IndexChanges { /*on an update*/
+ BSONObjSet oldkeys;
+ BSONObjSet newkeys;
+ vector<BSONObj*> removed; // these keys were removed as part of the change
+ vector<BSONObj*> added; // these keys were added as part of the change
+
+        /** @param curObjLoc - location of the object we want to add. if it is already in the
+            index, that is allowed here (for the background indexing case).
+ */
+ void dupCheck(IndexDetails& idx, DiskLoc curObjLoc) {
+ if( added.empty() || !idx.unique() )
+ return;
+ const Ordering ordering = Ordering::make(idx.keyPattern());
+ idx.idxInterface().uassertIfDups(idx, added, idx.head, curObjLoc, ordering); // "E11001 duplicate key on update"
+ }
+ };
+
+ class NamespaceDetails;
+ // changedId should be initialized to false
+    void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId);
+ void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc);
+} // namespace mongo
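
Taken together, the accessors above derive everything from the cached info object. For an index named "ts_1" with key { ts: 1 } on collection foo.coll (illustrative values):

    idx.parentNS();        // "foo.coll"        -- the collection being indexed
    idx.indexName();       // "ts_1"
    idx.indexNamespace();  // "foo.coll.$ts_1"  -- storage namespace for the btree
    idx.keyPattern();      // { ts: 1 }
    idx.isIdIndex();       // false: pattern is not exactly one "_id" field
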
diff --git a/src/mongo/db/indexkey.cpp b/src/mongo/db/indexkey.cpp
new file mode 100644
index 00000000000..18dfcb079b9
--- /dev/null
+++ b/src/mongo/db/indexkey.cpp
@@ -0,0 +1,462 @@
+// index_key.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "namespace-inl.h"
+#include "index.h"
+#include "btree.h"
+#include "ops/query.h"
+#include "background.h"
+#include "../util/text.h"
+
+namespace mongo {
+
+ /** old (<= v1.8) : 0
+ 1 is new version
+ */
+ const int DefaultIndexVersionNumber = 1;
+
+ map<string,IndexPlugin*> * IndexPlugin::_plugins;
+
+ IndexType::IndexType( const IndexPlugin * plugin , const IndexSpec * spec )
+ : _plugin( plugin ) , _spec( spec ) {
+
+ }
+
+ IndexType::~IndexType() {
+ }
+
+ const BSONObj& IndexType::keyPattern() const {
+ return _spec->keyPattern;
+ }
+
+ IndexPlugin::IndexPlugin( const string& name )
+ : _name( name ) {
+ if ( ! _plugins )
+ _plugins = new map<string,IndexPlugin*>();
+ (*_plugins)[name] = this;
+ }
+
+ string IndexPlugin::findPluginName( const BSONObj& keyPattern ) {
+ string pluginName = "";
+
+ BSONObjIterator i( keyPattern );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() != String )
+ continue;
+
+ uassert( 13007 , "can only have 1 index plugin / bad index key pattern" , pluginName.size() == 0 || pluginName == e.String() );
+ pluginName = e.String();
+ }
+
+ return pluginName;
+ }
+
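Only string-valued fields in a key pattern name a plugin, and at most one distinct plugin may appear. For illustration:

    IndexPlugin::findPluginName( BSON( "loc" << "2d" ) );               // "2d"
    IndexPlugin::findPluginName( BSON( "a" << 1 << "b" << -1 ) );       // "" (plain btree)
    IndexPlugin::findPluginName( BSON( "a" << "2d" << "b" << "foo" ) ); // uasserts 13007
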
+ int IndexType::compare( const BSONObj& l , const BSONObj& r ) const {
+ return l.woCompare( r , _spec->keyPattern );
+ }
+
+ void IndexSpec::_init() {
+ assert( keyPattern.objsize() );
+
+ // some basics
+ _nFields = keyPattern.nFields();
+ _sparse = info["sparse"].trueValue();
+ uassert( 13529 , "sparse only works for single field keys" , ! _sparse || _nFields );
+
+
+ {
+ // build _nullKey
+
+ BSONObjBuilder b;
+ BSONObjIterator i( keyPattern );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ _fieldNames.push_back( e.fieldName() );
+ _fixed.push_back( BSONElement() );
+ b.appendNull( "" );
+ }
+ _nullKey = b.obj();
+ }
+
+ {
+ // _nullElt
+ BSONObjBuilder b;
+ b.appendNull( "" );
+ _nullObj = b.obj();
+ _nullElt = _nullObj.firstElement();
+ }
+
+ {
+ // _undefinedElt
+ BSONObjBuilder b;
+ b.appendUndefined( "" );
+ _undefinedObj = b.obj();
+ _undefinedElt = _undefinedObj.firstElement();
+ }
+
+ {
+ // handle plugins
+ string pluginName = IndexPlugin::findPluginName( keyPattern );
+ if ( pluginName.size() ) {
+ IndexPlugin * plugin = IndexPlugin::get( pluginName );
+ if ( ! plugin ) {
+ log() << "warning: can't find plugin [" << pluginName << "]" << endl;
+ }
+ else {
+ _indexType.reset( plugin->generate( this ) );
+ }
+ }
+ }
+
+ _finishedInit = true;
+ }
+
+ void assertParallelArrays( const char *first, const char *second ) {
+ stringstream ss;
+ ss << "cannot index parallel arrays [" << first << "] [" << second << "]";
+ uasserted( ParallelArraysCode , ss.str() );
+ }
+
+ class KeyGeneratorV0 {
+ public:
+ KeyGeneratorV0( const IndexSpec &spec ) : _spec( spec ) {}
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ if ( _spec._indexType.get() ) { //plugin (eg geo)
+ _spec._indexType->getKeys( obj , keys );
+ return;
+ }
+ vector<const char*> fieldNames( _spec._fieldNames );
+ vector<BSONElement> fixed( _spec._fixed );
+ _getKeys( fieldNames , fixed , obj, keys );
+ if ( keys.empty() && ! _spec._sparse )
+ keys.insert( _spec._nullKey );
+ }
+
+ private:
+ void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys ) const {
+ BSONElement arrElt;
+ unsigned arrIdx = ~0;
+ int numNotFound = 0;
+
+ for( unsigned i = 0; i < fieldNames.size(); ++i ) {
+ if ( *fieldNames[ i ] == '\0' )
+ continue;
+
+ BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] );
+
+ if ( e.eoo() ) {
+ e = _spec._nullElt; // no matching field
+ numNotFound++;
+ }
+
+ if ( e.type() != Array )
+ fieldNames[ i ] = ""; // no matching field or non-array match
+
+ if ( *fieldNames[ i ] == '\0' )
+ fixed[ i ] = e; // no need for further object expansion (though array expansion still possible)
+
+ if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here
+ arrIdx = i;
+ arrElt = e;
+ }
+
+ // enforce single array path here
+ if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ) {
+ assertParallelArrays( e.fieldName(), arrElt.fieldName() );
+ }
+ }
+
+ bool allFound = true; // have we found elements for all field names in the key spec?
+ for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ) {
+ if ( **i != '\0' ) {
+ allFound = false;
+ break;
+ }
+ }
+
+ if ( _spec._sparse && numNotFound == _spec._nFields ) {
+ // we didn't find any fields
+ // so we're not going to index this document
+ return;
+ }
+
+ bool insertArrayNull = false;
+
+ if ( allFound ) {
+ if ( arrElt.eoo() ) {
+ // no terminal array element to expand
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i )
+ b.appendAs( *i, "" );
+ keys.insert( b.obj() );
+ }
+ else {
+ // terminal array element to expand, so generate all keys
+ BSONObjIterator i( arrElt.embeddedObject() );
+ if ( i.more() ) {
+ while( i.more() ) {
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( unsigned j = 0; j < fixed.size(); ++j ) {
+ if ( j == arrIdx )
+ b.appendAs( i.next(), "" );
+ else
+ b.appendAs( fixed[ j ], "" );
+ }
+ keys.insert( b.obj() );
+ }
+ }
+ else if ( fixed.size() > 1 ) {
+ insertArrayNull = true;
+ }
+ }
+ }
+ else {
+ // nonterminal array element to expand, so recurse
+ assert( !arrElt.eoo() );
+ BSONObjIterator i( arrElt.embeddedObject() );
+ if ( i.more() ) {
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == Object ) {
+ _getKeys( fieldNames, fixed, e.embeddedObject(), keys );
+ }
+ }
+ }
+ else {
+ insertArrayNull = true;
+ }
+ }
+
+ if ( insertArrayNull ) {
+ // x : [] - need to insert undefined
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( unsigned j = 0; j < fixed.size(); ++j ) {
+ if ( j == arrIdx ) {
+ b.appendUndefined( "" );
+ }
+ else {
+ BSONElement e = fixed[j];
+ if ( e.eoo() )
+ b.appendNull( "" );
+ else
+ b.appendAs( e , "" );
+ }
+ }
+ keys.insert( b.obj() );
+ }
+ }
+
+ const IndexSpec &_spec;
+ };
+
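To make the v0 expansion rules concrete (one array path is expanded, missing fields become null, an empty array becomes undefined), a sketch for an index with key pattern { a: 1, b: 1 }:

    // { a: [ 1, 2 ], b: 3 }   -> keys { "": 1, "": 3 } and { "": 2, "": 3 }
    // { a: 7 }                -> key  { "": 7, "": null }         (b missing)
    // { a: [], b: 3 }         -> key  { "": undefined, "": 3 }
    // { a: [ 1 ], b: [ 2 ] }  -> uasserted: cannot index parallel arrays
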
+ class KeyGeneratorV1 {
+ public:
+ KeyGeneratorV1( const IndexSpec &spec ) : _spec( spec ) {}
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ if ( _spec._indexType.get() ) { //plugin (eg geo)
+ _spec._indexType->getKeys( obj , keys );
+ return;
+ }
+ vector<const char*> fieldNames( _spec._fieldNames );
+ vector<BSONElement> fixed( _spec._fixed );
+ _getKeys( fieldNames , fixed , obj, keys );
+ if ( keys.empty() && ! _spec._sparse )
+ keys.insert( _spec._nullKey );
+ }
+
+ private:
+ /**
+ * @param arrayNestedArray - set if the returned element is an array nested directly within arr.
+ */
+ BSONElement extractNextElement( const BSONObj &obj, const BSONObj &arr, const char *&field, bool &arrayNestedArray ) const {
+ string firstField = mongoutils::str::before( field, '.' );
+ bool haveObjField = !obj.getField( firstField ).eoo();
+ BSONElement arrField = arr.getField( firstField );
+ bool haveArrField = !arrField.eoo();
+
+ // An index component field name cannot exist in both a document array and one of that array's children.
+ uassert( 15855 , str::stream() << "Ambiguous field name found in array (do not use numeric field names in embedded elements in an array), field: '" << arrField.fieldName() << "' for array: " << arr, !haveObjField || !haveArrField );
+
+ arrayNestedArray = false;
+ if ( haveObjField ) {
+ return obj.getFieldDottedOrArray( field );
+ }
+ else if ( haveArrField ) {
+ if ( arrField.type() == Array ) {
+ arrayNestedArray = true;
+ }
+ return arr.getFieldDottedOrArray( field );
+ }
+ return BSONElement();
+ }
+
+ void _getKeysArrEltFixed( vector<const char*> &fieldNames , vector<BSONElement> &fixed , const BSONElement &arrEntry, BSONObjSet &keys, int numNotFound, const BSONElement &arrObjElt, const set< unsigned > &arrIdxs, bool mayExpandArrayUnembedded ) const {
+ // set up any terminal array values
+ for( set<unsigned>::const_iterator j = arrIdxs.begin(); j != arrIdxs.end(); ++j ) {
+ if ( *fieldNames[ *j ] == '\0' ) {
+ fixed[ *j ] = mayExpandArrayUnembedded ? arrEntry : arrObjElt;
+ }
+ }
+ // recurse
+ _getKeys( fieldNames, fixed, ( arrEntry.type() == Object ) ? arrEntry.embeddedObject() : BSONObj(), keys, numNotFound, arrObjElt.embeddedObject() );
+ }
+
+ /**
+ * @param fieldNames - fields to index, may be postfixes in recursive calls
+ * @param fixed - values that have already been identified for their index fields
+ * @param obj - object from which keys should be extracted, based on names in fieldNames
+ * @param keys - set where index keys are written
+ * @param numNotFound - number of index fields that have already been identified as missing
+ * @param array - array from which keys should be extracted, based on names in fieldNames
+ * If obj and array are both nonempty, obj will be one of the elements of array.
+ */
+ void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys, int numNotFound = 0, const BSONObj &array = BSONObj() ) const {
+ BSONElement arrElt;
+ set<unsigned> arrIdxs;
+ bool mayExpandArrayUnembedded = true;
+ for( unsigned i = 0; i < fieldNames.size(); ++i ) {
+ if ( *fieldNames[ i ] == '\0' ) {
+ continue;
+ }
+
+ bool arrayNestedArray;
+ // Extract element matching fieldName[ i ] from object xor array.
+ BSONElement e = extractNextElement( obj, array, fieldNames[ i ], arrayNestedArray );
+
+ if ( e.eoo() ) {
+ // if field not present, set to null
+ fixed[ i ] = _spec._nullElt;
+ // done expanding this field name
+ fieldNames[ i ] = "";
+ numNotFound++;
+ }
+ else if ( e.type() == Array ) {
+ arrIdxs.insert( i );
+ if ( arrElt.eoo() ) {
+ // we only expand arrays on a single path -- track the path here
+ arrElt = e;
+ }
+ else if ( e.rawdata() != arrElt.rawdata() ) {
+ // enforce single array path here
+ assertParallelArrays( e.fieldName(), arrElt.fieldName() );
+ }
+ if ( arrayNestedArray ) {
+ mayExpandArrayUnembedded = false;
+ }
+ }
+ else {
+ // not an array - no need for further expansion
+ fixed[ i ] = e;
+ }
+ }
+
+ if ( arrElt.eoo() ) {
+ // No array, so generate a single key.
+ if ( _spec._sparse && numNotFound == _spec._nFields ) {
+ return;
+ }
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) {
+ b.appendAs( *i, "" );
+ }
+ keys.insert( b.obj() );
+ }
+ else if ( arrElt.embeddedObject().firstElement().eoo() ) {
+ // Empty array, so set matching fields to undefined.
+ _getKeysArrEltFixed( fieldNames, fixed, _spec._undefinedElt, keys, numNotFound, arrElt, arrIdxs, true );
+ }
+ else {
+ // Non empty array that can be expanded, so generate a key for each member.
+ BSONObj arrObj = arrElt.embeddedObject();
+ BSONObjIterator i( arrObj );
+ while( i.more() ) {
+ _getKeysArrEltFixed( fieldNames, fixed, i.next(), keys, numNotFound, arrElt, arrIdxs, mayExpandArrayUnembedded );
+ }
+ }
+ }
+
+ const IndexSpec &_spec;
+ };
+
+ void IndexSpec::getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ switch( indexVersion() ) {
+ case 0: {
+ KeyGeneratorV0 g( *this );
+ g.getKeys( obj, keys );
+ break;
+ }
+ case 1: {
+ KeyGeneratorV1 g( *this );
+ g.getKeys( obj, keys );
+ break;
+ }
+ default:
+ massert( 15869, "Invalid index version for key generation.", false );
+ }
+ }
+
+ bool anyElementNamesMatch( const BSONObj& a , const BSONObj& b ) {
+ BSONObjIterator x(a);
+ while ( x.more() ) {
+ BSONElement e = x.next();
+ BSONObjIterator y(b);
+ while ( y.more() ) {
+ BSONElement f = y.next();
+ FieldCompareResult res = compareDottedFieldNames( e.fieldName() , f.fieldName() );
+ if ( res == SAME || res == LEFT_SUBFIELD || res == RIGHT_SUBFIELD )
+ return true;
+ }
+ }
+ return false;
+ }
+
+ IndexSuitability IndexSpec::suitability( const BSONObj& query , const BSONObj& order ) const {
+ if ( _indexType.get() )
+ return _indexType->suitability( query , order );
+ return _suitability( query , order );
+ }
+
+ IndexSuitability IndexSpec::_suitability( const BSONObj& query , const BSONObj& order ) const {
+ // TODO: optimize
+ if ( anyElementNamesMatch( keyPattern , query ) == 0 && anyElementNamesMatch( keyPattern , order ) == 0 )
+ return USELESS;
+ return HELPFUL;
+ }
+
+ IndexSuitability IndexType::suitability( const BSONObj& query , const BSONObj& order ) const {
+ return _spec->_suitability( query , order );
+ }
+
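The default heuristic is deliberately coarse: any overlap of dotted field names between the key pattern and the query or sort makes a plain (non-plugin) index HELPFUL; OPTIMAL is only ever claimed by plugin types. For a spec with key pattern { a: 1 }:

    spec.suitability( BSON( "a" << 5 ), BSONObj() );   // HELPFUL (query mentions a)
    spec.suitability( BSONObj(), BSON( "a" << 1 ) );   // HELPFUL (sort mentions a)
    spec.suitability( BSON( "b" << 5 ), BSONObj() );   // USELESS
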
+ int IndexSpec::indexVersion() const {
+ if ( !info.hasField( "v" ) ) {
+ return DefaultIndexVersionNumber;
+ }
+ return IndexDetails::versionForIndexObj( info );
+ }
+
+ bool IndexType::scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const {
+ return ! order.isEmpty();
+ }
+
+}
diff --git a/src/mongo/db/indexkey.h b/src/mongo/db/indexkey.h
new file mode 100644
index 00000000000..12cd755e8a0
--- /dev/null
+++ b/src/mongo/db/indexkey.h
@@ -0,0 +1,198 @@
+// index_key.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "diskloc.h"
+#include "jsobj.h"
+#include <map>
+
+namespace mongo {
+
+ extern const int DefaultIndexVersionNumber;
+
+ const int ParallelArraysCode = 10088;
+
+ class Cursor;
+ class IndexSpec;
+ class IndexType; // TODO: this name sucks
+ class IndexPlugin;
+ class IndexDetails;
+
+ enum IndexSuitability { USELESS = 0 , HELPFUL = 1 , OPTIMAL = 2 };
+
+ /**
+     * this represents an instance of an index plugin,
+     * done this way so parsing, etc. can be cached per index:
+     * if there is an FTS IndexPlugin, each index using FTS
+     * gets 1 of these, and it can have things pre-parsed, etc.
+ */
+ class IndexType : boost::noncopyable {
+ public:
+ IndexType( const IndexPlugin * plugin , const IndexSpec * spec );
+ virtual ~IndexType();
+
+ virtual void getKeys( const BSONObj &obj, BSONObjSet &keys ) const = 0;
+ virtual shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const = 0;
+
+ /** optional op : changes query to match what's in the index */
+ virtual BSONObj fixKey( const BSONObj& in ) { return in; }
+
+ /** optional op : compare 2 objects with regards to this index */
+ virtual int compare( const BSONObj& l , const BSONObj& r ) const;
+
+ /** @return plugin */
+ const IndexPlugin * getPlugin() const { return _plugin; }
+
+ const BSONObj& keyPattern() const;
+
+ virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+ virtual bool scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const ;
+
+ protected:
+ const IndexPlugin * _plugin;
+ const IndexSpec * _spec;
+ };
+
+ /**
+ * this represents a plugin
+ * a plugin could be something like full text search, sparse index, etc...
+ * 1 of these exists per type of index per server
+ * 1 IndexType is created per index using this plugin
+ */
+ class IndexPlugin : boost::noncopyable {
+ public:
+ IndexPlugin( const string& name );
+ virtual ~IndexPlugin() {}
+
+ virtual IndexType* generate( const IndexSpec * spec ) const = 0;
+
+ string getName() const { return _name; }
+
+ /**
+ * @return new keyPattern
+ * if nothing changes, should return keyPattern
+ */
+ virtual BSONObj adjustIndexSpec( const BSONObj& spec ) const { return spec; }
+
+ // ------- static below -------
+
+ static IndexPlugin* get( const string& name ) {
+ if ( ! _plugins )
+ return 0;
+ map<string,IndexPlugin*>::iterator i = _plugins->find( name );
+ if ( i == _plugins->end() )
+ return 0;
+ return i->second;
+ }
+
+ /**
+ * @param keyPattern { x : "fts" }
+ * @return "" or the name
+ */
+ static string findPluginName( const BSONObj& keyPattern );
+
+ private:
+ string _name;
+ static map<string,IndexPlugin*> * _plugins;
+ };
+
+ /* precomputed details about an index, used for inserting keys on updates
+ stored/cached in NamespaceDetailsTransient, or can be used standalone
+ */
+ class IndexSpec {
+ public:
+ BSONObj keyPattern; // e.g., { name : 1 }
+ BSONObj info; // this is the same as IndexDetails::info.obj()
+
+ IndexSpec()
+ : _details(0) , _finishedInit(false) {
+ }
+
+ explicit IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() )
+ : keyPattern(k) , info(m) , _details(0) , _finishedInit(false) {
+ _init();
+ }
+
+ /**
+           this is a DiskLoc of an IndexDetails info object;
+           it should have a key field
+ */
+ explicit IndexSpec( const DiskLoc& loc ) {
+ reset( loc );
+ }
+
+ void reset( const BSONObj& info );
+ void reset( const DiskLoc& infoLoc ) { reset(infoLoc.obj()); }
+ void reset( const IndexDetails * details );
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const;
+
+ BSONElement missingField() const { return _nullElt; }
+
+ string getTypeName() const {
+ if ( _indexType.get() )
+ return _indexType->getPlugin()->getName();
+ return "";
+ }
+
+ IndexType* getType() const {
+ return _indexType.get();
+ }
+
+ const IndexDetails * getDetails() const {
+ return _details;
+ }
+
+ IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+ protected:
+
+ int indexVersion() const;
+
+ IndexSuitability _suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+ BSONSizeTracker _sizeTracker;
+ vector<const char*> _fieldNames;
+ vector<BSONElement> _fixed;
+
+ BSONObj _nullKey; // a full key with all fields null
+ BSONObj _nullObj; // only used for _nullElt
+ BSONElement _nullElt; // jstNull
+
+ BSONObj _undefinedObj; // only used for _undefinedElt
+ BSONElement _undefinedElt; // undefined
+
+ int _nFields; // number of fields in the index
+ bool _sparse; // if the index is sparse
+ shared_ptr<IndexType> _indexType;
+ const IndexDetails * _details;
+
+ void _init();
+
+ friend class IndexType;
+ friend class KeyGeneratorV0;
+ friend class KeyGeneratorV1;
+ public:
+ bool _finishedInit;
+ };
+
+
+} // namespace mongo
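
A plugin registers itself simply by existing: the IndexPlugin constructor inserts this into the static name map, so a file-scope instance is registered during static initialization. A minimal hypothetical plugin sketched against the interfaces above -- the "demo" name and the empty key handling are invented for illustration:

    class DemoType : public IndexType {
    public:
        DemoType( const IndexPlugin* p, const IndexSpec* s ) : IndexType( p, s ) {}
        void getKeys( const BSONObj& obj, BSONObjSet& keys ) const {
            // a real plugin derives index keys from obj here
        }
        shared_ptr<Cursor> newCursor( const BSONObj& query, const BSONObj& order,
                                      int numWanted ) const {
            return shared_ptr<Cursor>();   // omitted in this sketch
        }
    };

    class DemoPlugin : public IndexPlugin {
    public:
        DemoPlugin() : IndexPlugin( "demo" ) {}   // self-registers under "demo"
        IndexType* generate( const IndexSpec* spec ) const {
            return new DemoType( this, spec );
        }
    } demoPlugin;   // static instance: selected by a key pattern like { x: "demo" }
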
diff --git a/src/mongo/db/instance.cpp b/src/mongo/db/instance.cpp
new file mode 100644
index 00000000000..c8f8c6ea85b
--- /dev/null
+++ b/src/mongo/db/instance.cpp
@@ -0,0 +1,1148 @@
+// instance.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "../bson/util/atomic_int.h"
+#include "introspect.h"
+#include "repl.h"
+#include "dbmessage.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "json.h"
+#include "replutil.h"
+#include "../s/d_logic.h"
+#include "../util/file_allocator.h"
+#include "../util/goodies.h"
+#include "cmdline.h"
+#if !defined(_WIN32)
+#include <sys/file.h>
+#endif
+#include "stats/counters.h"
+#include "background.h"
+#include "dur_journal.h"
+#include "dur_recover.h"
+#include "d_concurrency.h"
+#include "ops/count.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+#include "ops/update.h"
+#include "pagefault.h"
+
+namespace mongo {
+
+ // "diaglog"
+ inline void opread(Message& m) { if( _diaglog.getLevel() & 2 ) _diaglog.readop((char *) m.singleData(), m.header()->len); }
+ inline void opwrite(Message& m) { if( _diaglog.getLevel() & 1 ) _diaglog.write((char *) m.singleData(), m.header()->len); }
+
+ void receivedKillCursors(Message& m);
+ void receivedUpdate(Message& m, CurOp& op);
+ void receivedDelete(Message& m, CurOp& op);
+ void receivedInsert(Message& m, CurOp& op);
+ bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop );
+
+ int nloggedsome = 0;
+#define LOGWITHRATELIMIT if( ++nloggedsome < 1000 || nloggedsome % 100 == 0 )
+
+ string dbExecCommand;
+
+ DiagLog _diaglog;
+
+ bool useCursors = true;
+ bool useHints = true;
+
+ KillCurrentOp killCurrentOp;
+
+ int lockFile = 0;
+#ifdef _WIN32
+ HANDLE lockFileHandle;
+#endif
+
+ // see FSyncCommand:
+ extern bool lockedForWriting;
+
+ OpTime OpTime::now() {
+ DEV d.dbMutex.assertWriteLocked();
+ return now_inlock();
+ }
+ OpTime OpTime::last_inlock(){
+ DEV d.dbMutex.assertAtLeastReadLocked();
+ return last;
+ }
+
+ // OpTime::now() uses dbMutex, thus it is in this file not in the cpp files used by drivers and such
+ void BSONElementManipulator::initTimestamp() {
+ massert( 10332 , "Expected CurrentTime type", _element.type() == Timestamp );
+ unsigned long long &timestamp = *( reinterpret_cast< unsigned long long* >( value() ) );
+ if ( timestamp == 0 )
+ timestamp = OpTime::now().asDate();
+ }
+ void BSONElementManipulator::SetNumber(double d) {
+ if ( _element.type() == NumberDouble )
+ *getDur().writing( reinterpret_cast< double * >( value() ) ) = d;
+ else if ( _element.type() == NumberInt )
+ *getDur().writing( reinterpret_cast< int * >( value() ) ) = (int) d;
+ else assert(0);
+ }
+ void BSONElementManipulator::SetLong(long long n) {
+ assert( _element.type() == NumberLong );
+ *getDur().writing( reinterpret_cast< long long * >(value()) ) = n;
+ }
+ void BSONElementManipulator::SetInt(int n) {
+ assert( _element.type() == NumberInt );
+ getDur().writingInt( *reinterpret_cast< int * >( value() ) ) = n;
+ }
+ /* dur:: version */
+ void BSONElementManipulator::ReplaceTypeAndValue( const BSONElement &e ) {
+ char *d = data();
+ char *v = value();
+ int valsize = e.valuesize();
+ int ofs = (int) (v-d);
+ dassert( ofs > 0 );
+ char *p = (char *) getDur().writingPtr(d, valsize + ofs);
+ *p = e.type();
+ memcpy( p + ofs, e.value(), valsize );
+ }
+
+ void inProgCmd( Message &m, DbResponse &dbresponse ) {
+ BSONObjBuilder b;
+
+ if( ! cc().isAdmin() ) {
+ b.append("err", "unauthorized");
+ }
+ else {
+ DbMessage d(m);
+ QueryMessage q(d);
+ bool all = q.query["$all"].trueValue();
+ vector<BSONObj> vals;
+ {
+ Client& me = cc();
+ scoped_lock bl(Client::clientsMutex);
+ auto_ptr<Matcher> m(new Matcher(q.query));
+ for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
+ Client *c = *i;
+ assert( c );
+ CurOp* co = c->curop();
+ if ( c == &me && !co ) {
+ continue;
+ }
+ assert( co );
+ if( all || co->active() ) {
+ BSONObj info = co->infoNoauth();
+ if ( all || m->matches( info )) {
+ vals.push_back( info );
+ }
+ }
+ }
+ }
+ b.append("inprog", vals);
+ unsigned x = lockedForWriting;
+ if( x ) {
+ b.append("fsyncLock", x);
+ b.append("info", "use db.fsyncUnlock() to terminate the fsync write/snapshot lock");
+ }
+ }
+
+ replyToQuery(0, m, dbresponse, b.obj());
+ }
+
+ void killOp( Message &m, DbResponse &dbresponse ) {
+ BSONObj obj;
+ if( ! cc().isAdmin() ) {
+ obj = fromjson("{\"err\":\"unauthorized\"}");
+ }
+ /*else if( !dbMutexInfo.isLocked() )
+ obj = fromjson("{\"info\":\"no op in progress/not locked\"}");
+ */
+ else {
+ DbMessage d(m);
+ QueryMessage q(d);
+ BSONElement e = q.query.getField("op");
+ if( !e.isNumber() ) {
+ obj = fromjson("{\"err\":\"no op number field specified?\"}");
+ }
+ else {
+ log() << "going to kill op: " << e << endl;
+ obj = fromjson("{\"info\":\"attempting to kill op\"}");
+ killCurrentOp.kill( (unsigned) e.number() );
+ }
+ }
+ replyToQuery(0, m, dbresponse, obj);
+ }
+
+ void unlockFsyncAndWait();
+ void unlockFsync(const char *ns, Message& m, DbResponse &dbresponse) {
+ BSONObj obj;
+ if ( ! cc().isAdmin() ) { // checks auth
+ obj = fromjson("{\"err\":\"unauthorized\"}");
+ }
+ else if (strncmp(ns, "admin.", 6) != 0 ) {
+ obj = fromjson("{\"err\":\"unauthorized - this command must be run against the admin DB\"}");
+ }
+ else {
+ if( lockedForWriting ) {
+ log() << "command: unlock requested" << endl;
+ obj = fromjson("{ok:1,\"info\":\"unlock completed\"}");
+ unlockFsyncAndWait();
+ }
+ else {
+ obj = fromjson("{ok:0,\"errmsg\":\"not locked\"}");
+ }
+ }
+ replyToQuery(0, m, dbresponse, obj);
+ }
+
+ static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) {
+ bool ok = true;
+ MSGID responseTo = m.header()->id;
+
+ DbMessage d(m);
+ QueryMessage q(d);
+ auto_ptr< Message > resp( new Message() );
+
+ CurOp& op = *(c.curop());
+
+ shared_ptr<AssertionException> ex;
+
+ try {
+ dbresponse.exhaust = runQuery(m, q, op, *resp);
+ assert( !resp->empty() );
+ }
+ catch ( SendStaleConfigException& e ){
+ ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg ) );
+ ok = false;
+ }
+ catch ( AssertionException& e ) {
+ ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
+ ok = false;
+ }
+
+ if( ex ){
+
+ op.debug().exceptionInfo = ex->getInfo();
+ LOGWITHRATELIMIT {
+ log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" <<
+ (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
+ if( q.ntoskip || q.ntoreturn )
+ log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl;
+ }
+
+ SendStaleConfigException* scex = NULL;
+ if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() );
+
+ BSONObjBuilder err;
+ ex->getInfo().append( err );
+ if( scex ) err.append( "ns", scex->getns() );
+ BSONObj errObj = err.done();
+
+ log() << errObj << endl;
+
+ BufBuilder b;
+ b.skip(sizeof(QueryResult));
+ b.appendBuf((void*) errObj.objdata(), errObj.objsize());
+
+ // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
+ QueryResult * msgdata = (QueryResult *) b.buf();
+ b.decouple();
+ QueryResult *qr = msgdata;
+ qr->_resultFlags() = ResultFlag_ErrSet;
+ if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ resp.reset( new Message() );
+ resp->setData( msgdata, true );
+
+ }
+
+ op.debug().responseLength = resp->header()->dataLen();
+
+ dbresponse.response = resp.release();
+ dbresponse.responseTo = responseTo;
+
+ return ok;
+ }
+
+ void (*reportEventToSystem)(const char *msg) = 0;
+
+ void mongoAbort(const char *msg) {
+ if( reportEventToSystem )
+ reportEventToSystem(msg);
+ rawOut(msg);
+ ::abort();
+ }
+
+    // handles a single client request; the old dbMsg "end" request is now just answered with a deprecation notice
+ void _assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort& remote ) {
+
+ // before we lock...
+ int op = m.operation();
+ bool isCommand = false;
+ const char *ns = m.singleData()->_data + 4;
+ if ( op == dbQuery ) {
+ if( strstr(ns, ".$cmd") ) {
+ isCommand = true;
+ opwrite(m);
+ if( strstr(ns, ".$cmd.sys.") ) {
+ if( strstr(ns, "$cmd.sys.inprog") ) {
+ inProgCmd(m, dbresponse);
+ return;
+ }
+ if( strstr(ns, "$cmd.sys.killop") ) {
+ killOp(m, dbresponse);
+ return;
+ }
+ if( strstr(ns, "$cmd.sys.unlock") ) {
+ unlockFsync(ns, m, dbresponse);
+ return;
+ }
+ }
+ }
+ else {
+ opread(m);
+ }
+ }
+ else if( op == dbGetMore ) {
+ opread(m);
+ }
+ else {
+ opwrite(m);
+ }
+
+ globalOpCounters.gotOp( op , isCommand );
+
+ Client& c = cc();
+
+ auto_ptr<CurOp> nestedOp;
+ CurOp* currentOpP = c.curop();
+ if ( currentOpP->active() ) {
+ nestedOp.reset( new CurOp( &c , currentOpP ) );
+ currentOpP = nestedOp.get();
+ }
+ CurOp& currentOp = *currentOpP;
+ currentOp.reset(remote,op);
+
+ OpDebug& debug = currentOp.debug();
+ debug.op = op;
+
+ int logThreshold = cmdLine.slowMS;
+ bool log = logLevel >= 1;
+
+ if ( op == dbQuery ) {
+ if ( handlePossibleShardedMessage( m , &dbresponse ) )
+ return;
+ receivedQuery(c , dbresponse, m );
+ }
+ else if ( op == dbGetMore ) {
+ if ( ! receivedGetMore(dbresponse, m, currentOp) )
+ log = true;
+ }
+ else if ( op == dbMsg ) {
+ // deprecated - replaced by commands
+ char *p = m.singleData()->_data;
+ int len = strlen(p);
+ if ( len > 400 )
+ out() << curTimeMillis64() % 10000 <<
+ " long msg received, len:" << len << endl;
+
+ Message *resp = new Message();
+ if ( strcmp( "end" , p ) == 0 )
+ resp->setData( opReply , "dbMsg end no longer supported" );
+ else
+ resp->setData( opReply , "i am fine - dbMsg deprecated");
+
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.header()->id;
+ }
+ else {
+ const char *ns = m.singleData()->_data + 4;
+ char cl[256];
+ nsToDatabase(ns, cl);
+ if( ! c.getAuthenticationInfo()->isAuthorized(cl) ) {
+ uassert_nothrow("unauthorized");
+ }
+ else {
+ try {
+ if ( op == dbInsert ) {
+ receivedInsert(m, currentOp);
+ }
+ else if ( op == dbUpdate ) {
+ receivedUpdate(m, currentOp);
+ }
+ else if ( op == dbDelete ) {
+ receivedDelete(m, currentOp);
+ }
+ else if ( op == dbKillCursors ) {
+ currentOp.ensureStarted();
+ logThreshold = 10;
+ receivedKillCursors(m);
+ }
+ else {
+ mongo::log() << " operation isn't supported: " << op << endl;
+ currentOp.done();
+ log = true;
+ }
+ }
+ catch ( UserException& ue ) {
+ tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << ue.toString() << endl;
+ debug.exceptionInfo = ue.getInfo();
+ }
+ catch ( AssertionException& e ) {
+ tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << e.toString() << endl;
+ debug.exceptionInfo = e.getInfo();
+ log = true;
+ }
+ }
+ }
+ currentOp.ensureStarted();
+ currentOp.done();
+ debug.executionTime = currentOp.totalTimeMillis();
+
+ //DEV log = true;
+ if ( log || debug.executionTime > logThreshold ) {
+ if( logLevel < 3 && op == dbGetMore && strstr(ns, ".oplog.") && debug.executionTime < 4300 && !log ) {
+ /* it's normal for getMore on the oplog to be slow because of use of awaitdata flag. */
+ }
+ else {
+ mongo::tlog() << debug << endl;
+ }
+ }
+
+ if ( currentOp.shouldDBProfile( debug.executionTime ) ) {
+ // performance profiling is on
+ if ( d.dbMutex.getState() < 0 ) {
+ mongo::log(1) << "note: not profiling because recursive read lock" << endl;
+ }
+ else {
+ writelock lk;
+ if ( dbHolder()._isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ) {
+ Client::Context cx( currentOp.getNS() );
+ profile(c , currentOp );
+ }
+ else {
+ mongo::log() << "note: not profiling because db went away - probably a close on: " << currentOp.getNS() << endl;
+ }
+ }
+ }
+
+ debug.reset();
+ } /* _assembleResponse() */
+
+ void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort& remote ) {
+ PageFaultRetryableSection s;
+ while( 1 ) {
+ try {
+ _assembleResponse( m, dbresponse, remote );
+ break;
+ }
+ catch( PageFaultException& e ) {
+ DEV log() << "TEMP PageFaultException touch and retry" << endl;
+ e.touch();
+ }
+ }
+ }
+
+ void receivedKillCursors(Message& m) {
+ int *x = (int *) m.singleData()->_data;
+ x++; // reserved
+ int n = *x++;
+
+ uassert( 13659 , "sent 0 cursors to kill" , n != 0 );
+ massert( 13658 , str::stream() << "bad kill cursors size: " << m.dataSize() , m.dataSize() == 8 + ( 8 * n ) );
+ uassert( 13004 , str::stream() << "sent negative cursors to kill: " << n , n >= 1 );
+
+ if ( n > 2000 ) {
+ log( n < 30000 ? LL_WARNING : LL_ERROR ) << "receivedKillCursors, n=" << n << endl;
+ assert( n < 30000 );
+ }
+
+ int found = ClientCursor::erase(n, (long long *) x);
+
+ if ( logLevel > 0 || found != n ) {
+ log( found == n ) << "killcursors: found " << found << " of " << n << endl;
+ }
+
+ }
+
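The asserts above pin down the OP_KILL_CURSORS body: a reserved int32, an int32 count n, then n 8-byte cursor ids, so dataSize must be exactly 8 + 8*n. A packed-struct sketch of that layout (illustrative only, not a type defined in this tree):

    #pragma pack(1)
    struct KillCursorsBody {      // follows the standard message header
        int reserved;             // skipped by the x++ above
        int nCursorIds;           // >= 1; warned at n > 2000, hard assert at 30000
        long long cursorIds[1];   // nCursorIds entries, 8 bytes each
    };
    #pragma pack()
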
+ /* db - database name
+ path - db directory
+ */
+ /*static*/ void Database::closeDatabase( const char *db, const string& path ) {
+ assertInWriteLock();
+
+ Client::Context * ctx = cc().getContext();
+ assert( ctx );
+ assert( ctx->inDB( db , path ) );
+ Database *database = ctx->db();
+ assert( database->name == db );
+
+ oplogCheckCloseDatabase( database ); // oplog caches some things, dirty its caches
+
+ if( BackgroundOperation::inProgForDb(db) ) {
+ log() << "warning: bg op in prog during close db? " << db << endl;
+ }
+
+ /* important: kill all open cursors on the database */
+ string prefix(db);
+ prefix += '.';
+ ClientCursor::invalidate(prefix.c_str());
+
+ NamespaceDetailsTransient::clearForPrefix( prefix.c_str() );
+
+ dbHolderW().erase( db, path );
+ ctx->_clear();
+ delete database; // closes files
+ }
+
+ void receivedUpdate(Message& m, CurOp& op) {
+ DbMessage d(m);
+ const char *ns = d.getns();
+ op.debug().ns = ns;
+ int flags = d.pullInt();
+ BSONObj query = d.nextJsObj();
+
+ assert( d.moreJSObjs() );
+ assert( query.objsize() < m.header()->dataLen() );
+ BSONObj toupdate = d.nextJsObj();
+ uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize);
+ assert( toupdate.objsize() < m.header()->dataLen() );
+ assert( query.objsize() + toupdate.objsize() < m.header()->dataLen() );
+ bool upsert = flags & UpdateOption_Upsert;
+ bool multi = flags & UpdateOption_Multi;
+ bool broadcast = flags & UpdateOption_Broadcast;
+
+ op.debug().query = query;
+ op.setQuery(query);
+
+ writelock lk;
+
+        // void ReplSetImpl::relinquish() uses the big write lock, so
+        // this is synchronized given our lock above.
+ uassert( 10054 , "not master", isMasterNs( ns ) );
+
+        // if this ever moves outside of the lock, the check in Client::Context::_finishInit needs to be adjusted
+ if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
+ return;
+
+ Client::Context ctx( ns );
+
+ UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() );
+ lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted ); // for getlasterror
+ }
+
+ void receivedDelete(Message& m, CurOp& op) {
+ DbMessage d(m);
+ const char *ns = d.getns();
+ op.debug().ns = ns;
+ int flags = d.pullInt();
+ bool justOne = flags & RemoveOption_JustOne;
+ bool broadcast = flags & RemoveOption_Broadcast;
+ assert( d.moreJSObjs() );
+ BSONObj pattern = d.nextJsObj();
+
+ op.debug().query = pattern;
+ op.setQuery(pattern);
+
+ writelock lk(ns);
+
+ // writelock is used to synchronize stepdowns w/ writes
+ uassert( 10056 , "not master", isMasterNs( ns ) );
+
+        // if this ever moves outside of the lock, the check in Client::Context::_finishInit needs to be adjusted
+ if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
+ return;
+
+ Client::Context ctx(ns);
+
+ long long n = deleteObjects(ns, pattern, justOne, true);
+ lastError.getSafe()->recordDelete( n );
+ }
+
+ QueryResult* emptyMoreResult(long long);
+
+ void OpTime::waitForDifferent(unsigned millis){
+ DEV d.dbMutex.assertAtLeastReadLocked();
+
+ if (*this != last) return; // check early
+
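+        // build an absolute deadline 'millis' milliseconds from now. boost::xtime
+        // keeps seconds and nanoseconds separately, so carry nsec overflow into sec.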
+ boost::xtime timeout;
+ boost::xtime_get(&timeout, boost::TIME_UTC);
+
+ timeout.nsec += millis * 1000*1000;
+ if (timeout.nsec >= 1000*1000*1000){
+ timeout.nsec -= 1000*1000*1000;
+ timeout.sec += 1;
+ }
+
+ do {
+ dbtemprelease tmp;
+ boost::mutex::scoped_lock lk(notifyMutex());
+ if (!notifier().timed_wait(lk, timeout))
+ return; // timed out
+ } while (*this != last);
+ }
+
+ bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
+ bool ok = true;
+
+ DbMessage d(m);
+
+ const char *ns = d.getns();
+ int ntoreturn = d.pullInt();
+ long long cursorid = d.pullInt64();
+
+ curop.debug().ns = ns;
+ curop.debug().ntoreturn = ntoreturn;
+ curop.debug().cursorid = cursorid;
+
+ time_t start = 0;
+ int pass = 0;
+ bool exhaust = false;
+ QueryResult* msgdata;
+ OpTime last;
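+        // for a tailing getMore on the oplog we may loop here, re-taking the read
+        // lock each pass, until data is available or roughly 4 seconds elapse.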
+ while( 1 ) {
+ try {
+ Client::ReadContext ctx(ns);
+ if (str::startsWith(ns, "local.oplog.")){
+ if (pass == 0)
+ last = OpTime::last_inlock();
+ else
+ last.waitForDifferent(1000/*ms*/);
+ }
+ msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust);
+ }
+ catch ( AssertionException& e ) {
+ exhaust = false;
+ curop.debug().exceptionInfo = e.getInfo();
+ msgdata = emptyMoreResult(cursorid);
+ ok = false;
+ }
+ if (msgdata == 0) {
+ exhaust = false;
+ massert(13073, "shutting down", !inShutdown() );
+ if( pass == 0 ) {
+ start = time(0);
+ }
+ else {
+ if( time(0) - start >= 4 ) {
+                        // after about 4 seconds, return. pass stops at 1000 normally;
+                        // we want to return occasionally so the slave can checkpoint.
+ pass = 10000;
+ }
+ }
+ pass++;
+ if (debug)
+ sleepmillis(20);
+ else
+ sleepmillis(2);
+ continue;
+ }
+ break;
+        }
+
+ Message *resp = new Message();
+ resp->setData(msgdata, true);
+ curop.debug().responseLength = resp->header()->dataLen();
+ curop.debug().nreturned = msgdata->nReturned;
+
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.header()->id;
+
+ if( exhaust ) {
+ curop.debug().exhaust = true;
+ dbresponse.exhaust = ns;
+ }
+
+ return ok;
+ }
+
+ void checkAndInsert(const char *ns, /*modifies*/BSONObj& js) {
+ uassert( 10059 , "object to insert too large", js.objsize() <= BSONObjMaxUserSize);
+ {
+            // check for no $-prefixed field names. note we only check the top level (scanning deep would be quite expensive).
+ BSONObjIterator i( js );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ uassert( 13511 , "document to insert can't have $ fields" , e.fieldName()[0] != '$' );
+ }
+ }
+ theDataFileMgr.insertWithObjMod(ns, js, false); // js may be modified in the call to add an _id field.
+ logOp("i", ns, js);
+ }
+
+ NOINLINE_DECL void insertMulti(bool keepGoing, const char *ns, vector<BSONObj>& objs) {
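+        // with keepGoing (ContinueOnError) a failed insert is skipped, unless it
+        // is the last object, in which case the exception propagates after the
+        // op counter is bumped by the number attempted so far.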
+ size_t i;
+ for (i=0; i<objs.size(); i++){
+ try {
+ checkAndInsert(ns, objs[i]);
+ getDur().commitIfNeeded();
+ } catch (const UserException&) {
+ if (!keepGoing || i == objs.size()-1){
+ globalOpCounters.incInsertInWriteLock(i);
+ throw;
+ }
+ // otherwise ignore and keep going
+ }
+ }
+
+ globalOpCounters.incInsertInWriteLock(i);
+ }
+
+ void receivedInsert(Message& m, CurOp& op) {
+ DbMessage d(m);
+ const char *ns = d.getns();
+ op.debug().ns = ns;
+
+ if( !d.moreJSObjs() ) {
+ // strange. should we complain?
+ return;
+ }
+ BSONObj first = d.nextJsObj();
+
+ vector<BSONObj> multi;
+ while (d.moreJSObjs()){
+ if (multi.empty()) // first pass
+ multi.push_back(first);
+ multi.push_back( d.nextJsObj() );
+ }
+
+ writelock lk(ns);
+ //LockCollectionExclusively lk(ns);
+
+        // CONCURRENCY TODO: is being read locked in the big lock sufficient here?
+ // writelock is used to synchronize stepdowns w/ writes
+ uassert( 10058 , "not master", isMasterNs(ns) );
+
+ if ( handlePossibleShardedMessage( m , 0 ) )
+ return;
+
+ Client::Context ctx(ns);
+
+ if( !multi.empty() ) {
+ const bool keepGoing = d.reservedField() & InsertOption_ContinueOnError;
+ insertMulti(keepGoing, ns, multi);
+ return;
+ }
+
+ checkAndInsert(ns, first);
+ globalOpCounters.incInsertInWriteLock(1);
+ }
+
+ void getDatabaseNames( vector< string > &names , const string& usePath ) {
+ boost::filesystem::path path( usePath );
+ for ( boost::filesystem::directory_iterator i( path );
+ i != boost::filesystem::directory_iterator(); ++i ) {
+ if ( directoryperdb ) {
+ boost::filesystem::path p = *i;
+ string dbName = p.leaf();
+ p /= ( dbName + ".ns" );
+ if ( MMF::exists( p ) )
+ names.push_back( dbName );
+ }
+ else {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" )
+ names.push_back( fileName.substr( 0, fileName.length() - 3 ) );
+ }
+ }
+ }
+
+ /* returns true if there is data on this server. useful when starting replication.
+ local database does NOT count except for rsoplog collection.
+ used to set the hasData field on replset heartbeat command response
+ */
+ bool replHasDatabases() {
+ vector<string> names;
+ getDatabaseNames(names);
+ if( names.size() >= 2 ) return true;
+ if( names.size() == 1 ) {
+ if( names[0] != "local" )
+ return true;
+ // we have a local database. return true if oplog isn't empty
+ {
+ readlock lk(rsoplog);
+ BSONObj o;
+ if( Helpers::getFirst(rsoplog, o) )
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) {
+ if ( lastError._get() )
+ lastError.startRequest( toSend, lastError._get() );
+ DbResponse dbResponse;
+ assembleResponse( toSend, dbResponse , _clientHost );
+ assert( dbResponse.response );
+ dbResponse.response->concat(); // can get rid of this if we make response handling smarter
+ response = *dbResponse.response;
+ getDur().commitIfNeeded();
+ return true;
+ }
+
+ void DBDirectClient::say( Message &toSend, bool isRetry ) {
+ if ( lastError._get() )
+ lastError.startRequest( toSend, lastError._get() );
+ DbResponse dbResponse;
+ assembleResponse( toSend, dbResponse , _clientHost );
+ getDur().commitIfNeeded();
+ }
+
+ auto_ptr<DBClientCursor> DBDirectClient::query(const string &ns, Query query, int nToReturn , int nToSkip ,
+ const BSONObj *fieldsToReturn , int queryOptions ) {
+
+ //if ( ! query.obj.isEmpty() || nToReturn != 0 || nToSkip != 0 || fieldsToReturn || queryOptions )
+ return DBClientBase::query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions );
+ //
+ //assert( query.obj.isEmpty() );
+ //throw UserException( (string)"yay:" + ns );
+ }
+
+ void DBDirectClient::killCursor( long long id ) {
+ ClientCursor::erase( id );
+ }
+
+ HostAndPort DBDirectClient::_clientHost = HostAndPort( "0.0.0.0" , 0 );
+
+ unsigned long long DBDirectClient::count(const string &ns, const BSONObj& query, int options, int limit, int skip ) {
+ LockCollectionForReading lk( ns );
+ string errmsg;
+ long long res = runCount( ns.c_str() , _countCmd( ns , query , options , limit , skip ) , errmsg );
+ if ( res == -1 )
+ return 0;
+ uassert( 13637 , str::stream() << "count failed in DBDirectClient: " << errmsg , res >= 0 );
+ return (unsigned long long )res;
+ }
+
+ DBClientBase * createDirectClient() {
+ return new DBDirectClient();
+ }
+
+ mongo::mutex exitMutex("exit");
+ AtomicUInt numExitCalls = 0;
+
+ bool inShutdown() {
+ return numExitCalls > 0;
+ }
+
+ void tryToOutputFatal( const string& s ) {
+ try {
+ rawOut( s );
+ return;
+ }
+ catch ( ... ) {}
+
+ try {
+ cerr << s << endl;
+ return;
+ }
+ catch ( ... ) {}
+
+        // uh-oh, not sure there is anything else we can do...
+ }
+
+ /** also called by ntservice.cpp */
+ void shutdownServer() {
+
+ log() << "shutdown: going to close listening sockets..." << endl;
+ ListeningSockets::get()->closeAll();
+
+ log() << "shutdown: going to flush diaglog..." << endl;
+ _diaglog.flush();
+
+ /* must do this before unmapping mem or you may get a seg fault */
+ log() << "shutdown: going to close sockets..." << endl;
+ boost::thread close_socket_thread( boost::bind(MessagingPort::closeAllSockets, 0) );
+
+ // wait until file preallocation finishes
+ // we would only hang here if the file_allocator code generates a
+ // synchronous signal, which we don't expect
+ log() << "shutdown: waiting for fs preallocator..." << endl;
+ FileAllocator::get()->waitUntilFinished();
+
+ if( cmdLine.dur ) {
+ log() << "shutdown: lock for final commit..." << endl;
+ {
+ int n = 10;
+ while( 1 ) {
+ // we may already be in a read lock from earlier in the call stack, so do read lock here
+ // to be consistent with that.
+ readlocktry w("", 20000);
+ if( w.got() ) {
+ log() << "shutdown: final commit..." << endl;
+ getDur().commitNow();
+ break;
+ }
+ if( --n <= 0 ) {
+ log() << "shutdown: couldn't acquire write lock, aborting" << endl;
+ mongoAbort("couldn't acquire write lock");
+ }
+ log() << "shutdown: waiting for write lock..." << endl;
+ }
+ }
+ MemoryMappedFile::flushAll(true);
+ }
+
+ log() << "shutdown: closing all files..." << endl;
+ stringstream ss3;
+ MemoryMappedFile::closeAllFiles( ss3 );
+ log() << ss3.str() << endl;
+
+ if( cmdLine.dur ) {
+ dur::journalCleanup(true);
+ }
+
+#if !defined(__sunos__)
+ if ( lockFile ) {
+ log() << "shutdown: removing fs lock..." << endl;
+ /* This ought to be an unlink(), but Eliot says the last
+ time that was attempted, there was a race condition
+ with acquirePathLock(). */
+#ifdef _WIN32
+ if( _chsize( lockFile , 0 ) )
+ log() << "couldn't remove fs lock " << WSAGetLastError() << endl;
+ CloseHandle(lockFileHandle);
+#else
+ if( ftruncate( lockFile , 0 ) )
+ log() << "couldn't remove fs lock " << errnoWithDescription() << endl;
+ flock( lockFile, LOCK_UN );
+#endif
+ }
+#endif
+ }
+
+ void exitCleanly( ExitCode code ) {
+ killCurrentOp.killAll();
+ {
+ dblock lk;
+ log() << "now exiting" << endl;
+ dbexit( code );
+ }
+ }
+
+
+ namespace dur {
+ extern mutex groupCommitMutex;
+ }
+
+ /* not using log() herein in case we are already locked */
+ NOINLINE_DECL void dbexit( ExitCode rc, const char *why, bool tryToGetLock ) {
+
+ auto_ptr<writelocktry> wlt;
+ if ( tryToGetLock ) {
+ wlt.reset( new writelocktry( "" , 2 * 60 * 1000 ) );
+ uassert( 13455 , "dbexit timed out getting lock" , wlt->got() );
+ }
+
+ Client * c = currentClient.get();
+ {
+ scoped_lock lk( exitMutex );
+ if ( numExitCalls++ > 0 ) {
+ if ( numExitCalls > 5 ) {
+ // this means something horrible has happened
+ ::_exit( rc );
+ }
+ stringstream ss;
+ ss << "dbexit: " << why << "; exiting immediately";
+ tryToOutputFatal( ss.str() );
+ if ( c ) c->shutdown();
+ ::exit( rc );
+ }
+ }
+
+ {
+ stringstream ss;
+ ss << "dbexit: " << why;
+ tryToOutputFatal( ss.str() );
+ }
+
+ try {
+ shutdownServer(); // gracefully shutdown instance
+ }
+ catch ( ... ) {
+ tryToOutputFatal( "shutdown failed with exception" );
+ }
+
+#if defined(_DEBUG)
+ try {
+ mutexDebugger.programEnding();
+ }
+ catch (...) { }
+#endif
+
+ // block the dur thread from doing any work for the rest of the run
+ log(2) << "shutdown: groupCommitMutex" << endl;
+ scoped_lock lk(dur::groupCommitMutex);
+
+#ifdef _WIN32
+ // Windows Service Controller wants to be told when we are down,
+ // so don't call ::exit() yet, or say "really exiting now"
+ //
+ if ( rc == EXIT_WINDOWS_SERVICE_STOP ) {
+ if ( c ) c->shutdown();
+ return;
+ }
+#endif
+ tryToOutputFatal( "dbexit: really exiting now" );
+ if ( c ) c->shutdown();
+ ::exit(rc);
+ }
+
+#if !defined(__sunos__)
+ void writePid(int fd) {
+ stringstream ss;
+ ss << getpid() << endl;
+ string s = ss.str();
+ const char * data = s.c_str();
+#ifdef _WIN32
+ assert ( _write( fd, data, strlen( data ) ) );
+#else
+ assert ( write( fd, data, strlen( data ) ) );
+#endif
+ }
+
+ void acquirePathLock(bool doingRepair) {
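+        // creates/opens <dbpath>/mongod.lock and takes an exclusive lock on it;
+        // a pre-existing non-empty lock file indicates an unclean shutdown.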
+ string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
+
+ bool oldFile = false;
+
+ if ( boost::filesystem::exists( name ) && boost::filesystem::file_size( name ) > 0 ) {
+ oldFile = true;
+ }
+
+#ifdef _WIN32
+ lockFileHandle = CreateFileA( name.c_str(), GENERIC_READ | GENERIC_WRITE,
+ 0 /* do not allow anyone else access */, NULL,
+ OPEN_ALWAYS /* success if fh can open */, 0, NULL );
+
+ if (lockFileHandle == INVALID_HANDLE_VALUE) {
+ DWORD code = GetLastError();
+ char *msg;
+ FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL, code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ (LPSTR)&msg, 0, NULL);
+ string m = msg;
+ str::stripTrailing(m, "\r\n");
+ uasserted( 13627 , str::stream() << "Unable to create/open lock file: " << name << ' ' << m << ". Is a mongod instance already running?" );
+ }
+ lockFile = _open_osfhandle((intptr_t)lockFileHandle, 0);
+#else
+ lockFile = open( name.c_str(), O_RDWR | O_CREAT , S_IRWXU | S_IRWXG | S_IRWXO );
+ if( lockFile <= 0 ) {
+ uasserted( 10309 , str::stream() << "Unable to create/open lock file: " << name << ' ' << errnoWithDescription() << " Is a mongod instance already running?" );
+ }
+ if (flock( lockFile, LOCK_EX | LOCK_NB ) != 0) {
+ close ( lockFile );
+ lockFile = 0;
+ uassert( 10310 , "Unable to lock file: " + name + ". Is a mongod instance already running?", 0 );
+ }
+#endif
+
+ if ( oldFile ) {
+            // we check this here because we want to see if we can get the lock.
+            // if we can't, then it's probably just another mongod running.
+
+ string errmsg;
+ if (cmdLine.dur) {
+ if (!dur::haveJournalFiles()) {
+
+ vector<string> dbnames;
+ getDatabaseNames( dbnames );
+
+ if ( dbnames.size() == 0 ) {
+ // this means that mongod crashed
+ // between initial startup and when journaling was initialized
+ // it is safe to continue
+ }
+ else {
+ errmsg = str::stream()
+ << "************** \n"
+ << "old lock file: " << name << ". probably means unclean shutdown,\n"
+ << "but there are no journal files to recover.\n"
+ << "this is likely human error or filesystem corruption.\n"
+ << "found " << dbnames.size() << " dbs.\n"
+ << "see: http://dochub.mongodb.org/core/repair for more information\n"
+ << "*************";
+ }
+
+
+ }
+ }
+ else {
+ if (!dur::haveJournalFiles() && !doingRepair) {
+ errmsg = str::stream()
+ << "************** \n"
+ << "Unclean shutdown detected.\n"
+ << "Please visit http://dochub.mongodb.org/core/repair for recovery instructions.\n"
+ << "*************";
+ }
+ }
+
+ if (!errmsg.empty()) {
+ cout << errmsg << endl;
+#ifdef _WIN32
+ CloseHandle( lockFileHandle );
+#else
+ close ( lockFile );
+#endif
+ lockFile = 0;
+ uassert( 12596 , "old lock file" , 0 );
+ }
+ }
+
+ // Not related to lock file, but this is where we handle unclean shutdown
+ if( !cmdLine.dur && dur::haveJournalFiles() ) {
+ cout << "**************" << endl;
+ cout << "Error: journal files are present in journal directory, yet starting without journaling enabled." << endl;
+ cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
+ cout << "**************" << endl;
+ uasserted(13597, "can't start without --journal enabled when journal/ files are present");
+ }
+
+#ifdef _WIN32
+ uassert( 13625, "Unable to truncate lock file", _chsize(lockFile, 0) == 0);
+ writePid( lockFile );
+ _commit( lockFile );
+#else
+ uassert( 13342, "Unable to truncate lock file", ftruncate(lockFile, 0) == 0);
+ writePid( lockFile );
+ fsync( lockFile );
+ flushMyDirectory(name);
+#endif
+ }
+#else
+ void acquirePathLock(bool) {
+        // TODO - it is very bad that the code above is not running here.
+
+ // Not related to lock file, but this is where we handle unclean shutdown
+ if( !cmdLine.dur && dur::haveJournalFiles() ) {
+ cout << "**************" << endl;
+ cout << "Error: journal files are present in journal directory, yet starting without --journal enabled." << endl;
+ cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
+ cout << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl;
+ cout << "**************" << endl;
+ uasserted(13618, "can't start without --journal enabled when journal/ files are present");
+ }
+ }
+#endif
+
+} // namespace mongo
diff --git a/src/mongo/db/instance.h b/src/mongo/db/instance.h
new file mode 100644
index 00000000000..9dde729997d
--- /dev/null
+++ b/src/mongo/db/instance.h
@@ -0,0 +1,174 @@
+// instance.h : Global state functions.
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+
+#include "../client/dbclient.h"
+#include "curop-inl.h"
+#include "security.h"
+#include "cmdline.h"
+#include "client.h"
+
+namespace mongo {
+
+ extern string dbExecCommand;
+
+ /** a high level recording of operations to the database - sometimes used for diagnostics
+ and debugging.
+ */
+ class DiagLog {
+ ofstream *f; // note this is never freed
+        /* level is a bitmask: 0 = off; 1 = writes; 2 = reads; 3 = both;
+           7 = all writes plus an occasional sample of reads.
+        */
+ int level;
+ mongo::mutex mutex;
+ void openFile() {
+ assert( f == 0 );
+ stringstream ss;
+ ss << dbpath << "/diaglog." << hex << time(0);
+ string name = ss.str();
+ f = new ofstream(name.c_str(), ios::out | ios::binary);
+ if ( ! f->good() ) {
+ problem() << "diagLogging couldn't open " << name << endl;
+ // todo what is this? :
+ throw 1717;
+ }
+ else {
+ log() << "diagLogging using file " << name << endl;
+ }
+ }
+ public:
+ DiagLog() : f(0) , level(0), mutex("DiagLog") { }
+ int getLevel() const { return level; }
+ /**
+ * @return old
+ */
+ int setLevel( int newLevel ) {
+ scoped_lock lk(mutex);
+ int old = level;
+ log() << "diagLogging level=" << newLevel << endl;
+ if( f == 0 ) {
+ openFile();
+ }
+ level = newLevel; // must be done AFTER f is set
+ return old;
+ }
+ void flush() {
+ if ( level ) {
+ log() << "flushing diag log" << endl;
+ scoped_lock lk(mutex);
+ f->flush();
+ }
+ }
+ void write(char *data,int len) {
+ if ( level & 1 ) {
+ scoped_lock lk(mutex);
+ f->write(data,len);
+ }
+ }
+ void readop(char *data, int len) {
+ if ( level & 2 ) {
+ bool log = (level & 4) == 0;
+ OCCASIONALLY log = true;
+ if ( log ) {
+ scoped_lock lk(mutex);
+ assert( f );
+ f->write(data,len);
+ }
+ }
+ }
+ };
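+    /* hypothetical usage sketch (not from the original source): to capture all
+       writes plus a sample of reads, a caller would do something like
+           _diaglog.setLevel( 7 );
+           _diaglog.write( msgData, msgLen );   // per write op; names are illustrative
+       and _diaglog.flush() before shutdown. */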
+
+ extern DiagLog _diaglog;
+
+ /* we defer response until we unlock. don't want a blocked socket to
+ keep things locked.
+ */
+ struct DbResponse {
+ Message *response;
+ MSGID responseTo;
+ const char *exhaust; /* points to ns if exhaust mode. 0=normal mode*/
+ DbResponse(Message *r, MSGID rt) : response(r), responseTo(rt), exhaust(0) { }
+ DbResponse() {
+ response = 0;
+ exhaust = 0;
+ }
+ ~DbResponse() { delete response; }
+ };
+
+ void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort &client );
+
+ void getDatabaseNames( vector< string > &names , const string& usePath = dbpath );
+
+    /* returns true if there is data on this server. useful when starting replication.
+       local database does NOT count except for the rsoplog collection.
+    */
+ bool replHasDatabases();
+
+ /** "embedded" calls to the local server directly.
+ Caller does not need to lock, that is handled within.
+ */
+ class DBDirectClient : public DBClientBase {
+ public:
+ virtual auto_ptr<DBClientCursor> query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0,
+ const BSONObj *fieldsToReturn = 0, int queryOptions = 0);
+
+ virtual bool isFailed() const {
+ return false;
+ }
+ virtual string toString() {
+ return "DBDirectClient";
+ }
+ virtual string getServerAddress() const {
+ return "localhost"; // TODO: should this have the port?
+ }
+ virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 );
+ virtual void say( Message &toSend, bool isRetry = false );
+ virtual void sayPiggyBack( Message &toSend ) {
+ // don't need to piggy back when connected locally
+ return say( toSend );
+ }
+
+ virtual void killCursor( long long cursorID );
+
+ virtual bool callRead( Message& toSend , Message& response ) {
+ return call( toSend , response );
+ }
+
+ virtual unsigned long long count(const string &ns, const BSONObj& query = BSONObj(), int options=0, int limit=0, int skip=0 );
+
+ virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; }
+
+ double getSoTimeout() const { return 0; }
+
+ virtual bool lazySupported() const { return true; }
+ private:
+ static HostAndPort _clientHost;
+ };
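+    /* hypothetical usage sketch (not from the original source):
+           DBDirectClient c;
+           unsigned long long n = c.count( "test.foo" );
+       runs against this server in-process; no socket, locking handled inside. */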
+
+ extern int lockFile;
+#ifdef _WIN32
+ extern HANDLE lockFileHandle;
+#endif
+ void acquirePathLock(bool doingRepair=false); // if doingRepair=true don't consider unclean shutdown an error
+ void maybeCreatePidFile();
+
+} // namespace mongo
diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp
new file mode 100644
index 00000000000..7e1d19ce2f3
--- /dev/null
+++ b/src/mongo/db/introspect.cpp
@@ -0,0 +1,88 @@
+// introspect.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "introspect.h"
+#include "../bson/util/builder.h"
+#include "../util/goodies.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "pdfile.h"
+#include "curop.h"
+
+namespace mongo {
+
+ BufBuilder profileBufBuilder; // reused, instead of allocated every time - avoids a malloc/free cycle
+
+ void profile( const Client& c , CurOp& currentOp ) {
+ assertInWriteLock();
+
+ Database *db = c.database();
+ DEV assert( db );
+ const char *ns = db->profileName.c_str();
+
+ // build object
+ profileBufBuilder.reset();
+ BSONObjBuilder b(profileBufBuilder);
+ b.appendDate("ts", jsTime());
+ currentOp.debug().append( currentOp , b );
+
+ b.append("client", c.clientAddress() );
+
+ if ( c.getAuthenticationInfo() )
+ b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );
+
+ BSONObj p = b.done();
+
+ if (p.objsize() > 100*1024){
+ string small = p.toString(/*isArray*/false, /*full*/false);
+
+ warning() << "can't add full line to system.profile: " << small;
+
+ // rebuild with limited info
+ BSONObjBuilder b(profileBufBuilder);
+ b.appendDate("ts", jsTime());
+ b.append("client", c.clientAddress() );
+ if ( c.getAuthenticationInfo() )
+ b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );
+
+ b.append("err", "profile line too large (max is 100KB)");
+ if (small.size() < 100*1024){ // should be much smaller but if not don't break anything
+ b.append("abbreviated", small);
+ }
+
+ p = b.done();
+ }
+
+ // write: not replicated
+ NamespaceDetails *d = db->namespaceIndex.details(ns);
+ if( d ) {
+ int len = p.objsize();
+ Record *r = theDataFileMgr.fast_oplog_insert(d, ns, len);
+ memcpy(getDur().writingPtr(r->data, len), p.objdata(), len);
+ }
+ else {
+ static time_t last;
+ if( time(0) > last+10 ) {
+ log() << "profile: warning ns " << ns << " does not exist" << endl;
+ last = time(0);
+ }
+ }
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/introspect.h b/src/mongo/db/introspect.h
new file mode 100644
index 00000000000..209eeacab7c
--- /dev/null
+++ b/src/mongo/db/introspect.h
@@ -0,0 +1,34 @@
+// introspect.h
+// system management stuff.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+#include "pdfile.h"
+
+namespace mongo {
+
+ /* --- profiling --------------------------------------------
+ do when database->profile is set
+ */
+
+ void profile( const Client& c , CurOp& currentOp );
+
+} // namespace mongo
diff --git a/src/mongo/db/javatest.cpp b/src/mongo/db/javatest.cpp
new file mode 100644
index 00000000000..22f2bdf8d3c
--- /dev/null
+++ b/src/mongo/db/javatest.cpp
@@ -0,0 +1,24 @@
+// javatest.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "javajs.h"
+
+int main() {
+ JavaJS = new JavaJSImpl();
+ javajstest();
+}
diff --git a/src/mongo/db/jsobj.cpp b/src/mongo/db/jsobj.cpp
new file mode 100644
index 00000000000..1e850982396
--- /dev/null
+++ b/src/mongo/db/jsobj.cpp
@@ -0,0 +1,1268 @@
+/** @file jsobj.cpp - BSON implementation
+ http://www.mongodb.org/display/DOCS/BSON
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "../bson/oid.h"
+#include "jsobj.h"
+#include "nonce.h"
+#include "../bson/util/atomic_int.h"
+#include "../util/base64.h"
+#include "../util/md5.hpp"
+#include <limits>
+#include <cmath>
+#include "../util/unittest.h"
+#include "../util/embedded_builder.h"
+#include "../util/stringutils.h"
+#include "../util/mongoutils/str.h"
+#include "json.h"
+#include "jsobjmanipulator.h"
+#include "../util/optime.h"
+#include <boost/static_assert.hpp>
+#undef assert
+#define assert MONGO_assert
+
+// make sure our assumptions are valid
+BOOST_STATIC_ASSERT( sizeof(short) == 2 );
+BOOST_STATIC_ASSERT( sizeof(int) == 4 );
+BOOST_STATIC_ASSERT( sizeof(long long) == 8 );
+BOOST_STATIC_ASSERT( sizeof(double) == 8 );
+BOOST_STATIC_ASSERT( sizeof(mongo::Date_t) == 8 );
+BOOST_STATIC_ASSERT( sizeof(mongo::OID) == 12 );
+
+namespace mongo {
+
+ BSONElement eooElement;
+
+ GENOIDLabeler GENOID;
+
+ DateNowLabeler DATENOW;
+ NullLabeler BSONNULL;
+
+ MinKeyLabeler MINKEY;
+ MaxKeyLabeler MAXKEY;
+
+ // need to move to bson/, but has dependency on base64 so move that to bson/util/ first.
+ inline string BSONElement::jsonString( JsonStringFormat format, bool includeFieldNames, int pretty ) const {
+ BSONType t = type();
+ int sign;
+ if ( t == Undefined )
+ return "undefined";
+
+ stringstream s;
+ if ( includeFieldNames )
+ s << '"' << escape( fieldName() ) << "\" : ";
+ switch ( type() ) {
+ case mongo::String:
+ case Symbol:
+ s << '"' << escape( string(valuestr(), valuestrsize()-1) ) << '"';
+ break;
+ case NumberLong:
+ s << _numberLong();
+ break;
+ case NumberInt:
+ case NumberDouble:
+ if ( number() >= -numeric_limits< double >::max() &&
+ number() <= numeric_limits< double >::max() ) {
+ s.precision( 16 );
+ s << number();
+ }
+ else if ( mongo::isNaN(number()) ) {
+ s << "NaN";
+ }
+ else if ( mongo::isInf(number(), &sign) ) {
+ s << ( sign == 1 ? "Infinity" : "-Infinity");
+ }
+ else {
+ StringBuilder ss;
+ ss << "Number " << number() << " cannot be represented in JSON";
+ string message = ss.str();
+ massert( 10311 , message.c_str(), false );
+ }
+ break;
+ case mongo::Bool:
+ s << ( boolean() ? "true" : "false" );
+ break;
+ case jstNULL:
+ s << "null";
+ break;
+ case Object:
+ s << embeddedObject().jsonString( format, pretty );
+ break;
+ case mongo::Array: {
+ if ( embeddedObject().isEmpty() ) {
+ s << "[]";
+ break;
+ }
+ s << "[ ";
+ BSONObjIterator i( embeddedObject() );
+ BSONElement e = i.next();
+ if ( !e.eoo() ) {
+ int count = 0;
+ while ( 1 ) {
+ if( pretty ) {
+ s << '\n';
+ for( int x = 0; x < pretty; x++ )
+ s << " ";
+ }
+
+ if (strtol(e.fieldName(), 0, 10) > count) {
+ s << "undefined";
+ }
+ else {
+ s << e.jsonString( format, false, pretty?pretty+1:0 );
+ e = i.next();
+ }
+ count++;
+ if ( e.eoo() )
+ break;
+ s << ", ";
+ }
+ }
+ s << " ]";
+ break;
+ }
+ case DBRef: {
+ mongo::OID *x = (mongo::OID *) (valuestr() + valuestrsize());
+ if ( format == TenGen )
+ s << "Dbref( ";
+ else
+ s << "{ \"$ref\" : ";
+ s << '"' << valuestr() << "\", ";
+ if ( format != TenGen )
+ s << "\"$id\" : ";
+ s << '"' << *x << "\" ";
+ if ( format == TenGen )
+ s << ')';
+ else
+ s << '}';
+ break;
+ }
+ case jstOID:
+ if ( format == TenGen ) {
+ s << "ObjectId( ";
+ }
+ else {
+ s << "{ \"$oid\" : ";
+ }
+ s << '"' << __oid() << '"';
+ if ( format == TenGen ) {
+ s << " )";
+ }
+ else {
+ s << " }";
+ }
+ break;
+ case BinData: {
+ int len = *(int *)( value() );
+ BinDataType type = BinDataType( *(char *)( (int *)( value() ) + 1 ) );
+ s << "{ \"$binary\" : \"";
+ char *start = ( char * )( value() ) + sizeof( int ) + 1;
+ base64::encode( s , start , len );
+ s << "\", \"$type\" : \"" << hex;
+ s.width( 2 );
+ s.fill( '0' );
+ s << type << dec;
+ s << "\" }";
+ break;
+ }
+ case mongo::Date:
+ if ( format == Strict )
+ s << "{ \"$date\" : ";
+ else
+ s << "Date( ";
+ if( pretty ) {
+ Date_t d = date();
+ if( d == 0 ) s << '0';
+ else
+ s << '"' << date().toString() << '"';
+ }
+ else
+ s << date();
+ if ( format == Strict )
+ s << " }";
+ else
+ s << " )";
+ break;
+ case RegEx:
+ if ( format == Strict ) {
+ s << "{ \"$regex\" : \"" << escape( regex() );
+ s << "\", \"$options\" : \"" << regexFlags() << "\" }";
+ }
+ else {
+ s << "/" << escape( regex() , true ) << "/";
+ // FIXME Worry about alpha order?
+ for ( const char *f = regexFlags(); *f; ++f ) {
+ switch ( *f ) {
+ case 'g':
+ case 'i':
+ case 'm':
+ s << *f;
+ default:
+ break;
+ }
+ }
+ }
+ break;
+
+ case CodeWScope: {
+ BSONObj scope = codeWScopeObject();
+ if ( ! scope.isEmpty() ) {
+ s << "{ \"$code\" : " << _asCode() << " , "
+ << " \"$scope\" : " << scope.jsonString() << " }";
+ break;
+ }
+ }
+
+ case Code:
+ s << _asCode();
+ break;
+
+ case Timestamp:
+ s << "{ \"t\" : " << timestampTime() << " , \"i\" : " << timestampInc() << " }";
+ break;
+
+ case MinKey:
+ s << "{ \"$minKey\" : 1 }";
+ break;
+
+ case MaxKey:
+ s << "{ \"$maxKey\" : 1 }";
+ break;
+
+ default:
+ StringBuilder ss;
+ ss << "Cannot create a properly formatted JSON string with "
+ << "element: " << toString() << " of type: " << type();
+ string message = ss.str();
+ massert( 10312 , message.c_str(), false );
+ }
+ return s.str();
+ }
+
+ int BSONElement::getGtLtOp( int def ) const {
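+        // decodes an operator field name ("$gt", "$lte", "$in", ...) by matching
+        // characters directly rather than via strcmp; note that "$near..." is
+        // matched on the prefix alone.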
+ const char *fn = fieldName();
+ if ( fn[0] == '$' && fn[1] ) {
+ if ( fn[2] == 't' ) {
+ if ( fn[1] == 'g' ) {
+ if ( fn[3] == 0 ) return BSONObj::GT;
+ else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::GTE;
+ }
+ else if ( fn[1] == 'l' ) {
+ if ( fn[3] == 0 ) return BSONObj::LT;
+ else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::LTE;
+ }
+ }
+ else if ( fn[1] == 'n' && fn[2] == 'e' ) {
+ if ( fn[3] == 0 )
+ return BSONObj::NE;
+ if ( fn[3] == 'a' && fn[4] == 'r') // matches anything with $near prefix
+ return BSONObj::opNEAR;
+ }
+ else if ( fn[1] == 'm' ) {
+ if ( fn[2] == 'o' && fn[3] == 'd' && fn[4] == 0 )
+ return BSONObj::opMOD;
+ if ( fn[2] == 'a' && fn[3] == 'x' && fn[4] == 'D' && fn[5] == 'i' && fn[6] == 's' && fn[7] == 't' && fn[8] == 'a' && fn[9] == 'n' && fn[10] == 'c' && fn[11] == 'e' && fn[12] == 0 )
+ return BSONObj::opMAX_DISTANCE;
+ }
+ else if ( fn[1] == 't' && fn[2] == 'y' && fn[3] == 'p' && fn[4] == 'e' && fn[5] == 0 )
+ return BSONObj::opTYPE;
+ else if ( fn[1] == 'i' && fn[2] == 'n' && fn[3] == 0 )
+ return BSONObj::opIN;
+ else if ( fn[1] == 'n' && fn[2] == 'i' && fn[3] == 'n' && fn[4] == 0 )
+ return BSONObj::NIN;
+ else if ( fn[1] == 'a' && fn[2] == 'l' && fn[3] == 'l' && fn[4] == 0 )
+ return BSONObj::opALL;
+ else if ( fn[1] == 's' && fn[2] == 'i' && fn[3] == 'z' && fn[4] == 'e' && fn[5] == 0 )
+ return BSONObj::opSIZE;
+ else if ( fn[1] == 'e' ) {
+ if ( fn[2] == 'x' && fn[3] == 'i' && fn[4] == 's' && fn[5] == 't' && fn[6] == 's' && fn[7] == 0 )
+ return BSONObj::opEXISTS;
+ if ( fn[2] == 'l' && fn[3] == 'e' && fn[4] == 'm' && fn[5] == 'M' && fn[6] == 'a' && fn[7] == 't' && fn[8] == 'c' && fn[9] == 'h' && fn[10] == 0 )
+ return BSONObj::opELEM_MATCH;
+ }
+ else if ( fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'g' && fn[4] == 'e' && fn[5] == 'x' && fn[6] == 0 )
+ return BSONObj::opREGEX;
+ else if ( fn[1] == 'o' && fn[2] == 'p' && fn[3] == 't' && fn[4] == 'i' && fn[5] == 'o' && fn[6] == 'n' && fn[7] == 's' && fn[8] == 0 )
+ return BSONObj::opOPTIONS;
+ else if ( fn[1] == 'w' && fn[2] == 'i' && fn[3] == 't' && fn[4] == 'h' && fn[5] == 'i' && fn[6] == 'n' && fn[7] == 0 )
+ return BSONObj::opWITHIN;
+ }
+ return def;
+ }
+
+ /* Matcher --------------------------------------*/
+
+// If the element is something like:
+// a : { $gt : 3 }
+// we append
+// a : 3
+// else we just append the element.
+//
+ void appendElementHandlingGtLt(BSONObjBuilder& b, const BSONElement& e) {
+ if ( e.type() == Object ) {
+ BSONElement fe = e.embeddedObject().firstElement();
+ const char *fn = fe.fieldName();
+ if ( fn[0] == '$' && fn[1] && fn[2] == 't' ) {
+ b.appendAs(fe, e.fieldName());
+ return;
+ }
+ }
+ b.append(e);
+ }
+
+ int getGtLtOp(const BSONElement& e) {
+ if ( e.type() != Object )
+ return BSONObj::Equality;
+
+ BSONElement fe = e.embeddedObject().firstElement();
+ return fe.getGtLtOp();
+ }
+
+ FieldCompareResult compareDottedFieldNames( const string& l , const string& r ) {
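+        // compares one '.'-separated segment at a time with lexNumCmp; if one
+        // name is exhausted first, it is the parent of the other (SUBFIELD result).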
+ static int maxLoops = 1024 * 1024;
+
+ size_t lstart = 0;
+ size_t rstart = 0;
+
+ for ( int i=0; i<maxLoops; i++ ) {
+
+ size_t a = l.find( '.' , lstart );
+ size_t b = r.find( '.' , rstart );
+
+ size_t lend = a == string::npos ? l.size() : a;
+ size_t rend = b == string::npos ? r.size() : b;
+
+ const string& c = l.substr( lstart , lend - lstart );
+ const string& d = r.substr( rstart , rend - rstart );
+
+ int x = lexNumCmp( c.c_str(), d.c_str() );
+
+ if ( x < 0 )
+ return LEFT_BEFORE;
+ if ( x > 0 )
+ return RIGHT_BEFORE;
+
+ lstart = lend + 1;
+ rstart = rend + 1;
+
+ if ( lstart >= l.size() ) {
+ if ( rstart >= r.size() )
+ return SAME;
+ return RIGHT_SUBFIELD;
+ }
+ if ( rstart >= r.size() )
+ return LEFT_SUBFIELD;
+ }
+
+ log() << "compareDottedFieldNames ERROR l: " << l << " r: " << r << " TOO MANY LOOPS" << endl;
+ assert(0);
+ return SAME; // will never get here
+ }
+
+ /* BSONObj ------------------------------------------------------------*/
+
+ string BSONObj::md5() const {
+ md5digest d;
+ md5_state_t st;
+ md5_init(&st);
+ md5_append( &st , (const md5_byte_t*)_objdata , objsize() );
+ md5_finish(&st, d);
+ return digestToString( d );
+ }
+
+ string BSONObj::jsonString( JsonStringFormat format, int pretty ) const {
+
+ if ( isEmpty() ) return "{}";
+
+ StringBuilder s;
+ s << "{ ";
+ BSONObjIterator i(*this);
+ BSONElement e = i.next();
+ if ( !e.eoo() )
+ while ( 1 ) {
+ s << e.jsonString( format, true, pretty?pretty+1:0 );
+ e = i.next();
+ if ( e.eoo() )
+ break;
+ s << ",";
+ if ( pretty ) {
+ s << '\n';
+ for( int x = 0; x < pretty; x++ )
+ s << " ";
+ }
+ else {
+ s << " ";
+ }
+ }
+ s << " }";
+ return s.str();
+ }
+
+ bool BSONObj::valid() const {
+ try {
+ BSONObjIterator it(*this);
+ while( it.moreWithEOO() ) {
+ // both throw exception on failure
+ BSONElement e = it.next(true);
+ e.validate();
+
+ if (e.eoo()) {
+ if (it.moreWithEOO())
+ return false;
+ return true;
+ }
+ else if (e.isABSONObj()) {
+ if(!e.embeddedObject().valid())
+ return false;
+ }
+ else if (e.type() == CodeWScope) {
+ if(!e.codeWScopeObject().valid())
+ return false;
+ }
+ }
+ }
+ catch (...) {
+ }
+ return false;
+ }
+
+ int BSONObj::woCompare(const BSONObj& r, const Ordering &o, bool considerFieldName) const {
+ if ( isEmpty() )
+ return r.isEmpty() ? 0 : -1;
+ if ( r.isEmpty() )
+ return 1;
+
+ BSONObjIterator i(*this);
+ BSONObjIterator j(r);
+ unsigned mask = 1;
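+        // 'mask' selects the Ordering bit that says whether the corresponding
+        // key field is descending; shifted once per field compared.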
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ {
+ x = l.woCompare( r, considerFieldName );
+ if( o.descending(mask) )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ mask <<= 1;
+ }
+ return -1;
+ }
+
+ /* well ordered compare */
+ int BSONObj::woCompare(const BSONObj &r, const BSONObj &idxKey,
+ bool considerFieldName) const {
+ if ( isEmpty() )
+ return r.isEmpty() ? 0 : -1;
+ if ( r.isEmpty() )
+ return 1;
+
+ bool ordered = !idxKey.isEmpty();
+
+ BSONObjIterator i(*this);
+ BSONObjIterator j(r);
+ BSONObjIterator k(idxKey);
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ BSONElement o;
+ if ( ordered )
+ o = k.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ /*
+ if( ordered && o.type() == String && strcmp(o.valuestr(), "ascii-proto") == 0 &&
+ l.type() == String && r.type() == String ) {
+ // note: no negative support yet, as this is just sort of a POC
+ x = _stricmp(l.valuestr(), r.valuestr());
+ }
+ else*/ {
+ x = l.woCompare( r, considerFieldName );
+ if ( ordered && o.number() < 0 )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ }
+ return -1;
+ }
+
+ BSONObj staticNull = fromjson( "{'':null}" );
+ BSONObj makeUndefined() {
+ BSONObjBuilder b;
+ b.appendUndefined( "" );
+ return b.obj();
+ }
+ BSONObj staticUndefined = makeUndefined();
+
+ /* well ordered compare */
+ int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey , bool useDotted ) const {
+ if ( isEmpty() )
+ return other.isEmpty() ? 0 : -1;
+ if ( other.isEmpty() )
+ return 1;
+
+ uassert( 10060 , "woSortOrder needs a non-empty sortKey" , ! sortKey.isEmpty() );
+
+ BSONObjIterator i(sortKey);
+ while ( 1 ) {
+ BSONElement f = i.next();
+ if ( f.eoo() )
+ return 0;
+
+ BSONElement l = useDotted ? getFieldDotted( f.fieldName() ) : getField( f.fieldName() );
+ if ( l.eoo() )
+ l = staticNull.firstElement();
+ BSONElement r = useDotted ? other.getFieldDotted( f.fieldName() ) : other.getField( f.fieldName() );
+ if ( r.eoo() )
+ r = staticNull.firstElement();
+
+ int x = l.woCompare( r, false );
+ if ( f.number() < 0 )
+ x = -x;
+ if ( x != 0 )
+ return x;
+ }
+ return -1;
+ }
+
+ template <typename BSONElementColl>
+ void _getFieldsDotted( const BSONObj* obj, const StringData& name, BSONElementColl &ret, bool expandLastArray ) {
+ BSONElement e = obj->getField( name );
+
+ if ( e.eoo() ) {
+ const char *p = strchr(name.data(), '.');
+ if ( p ) {
+ string left(name.data(), p-name.data());
+ const char* next = p+1;
+ BSONElement e = obj->getField( left.c_str() );
+
+ if (e.type() == Object) {
+ e.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
+ }
+ else if (e.type() == Array) {
+ bool allDigits = false;
+ if ( isdigit( *next ) ) {
+ const char * temp = next + 1;
+ while ( isdigit( *temp ) )
+ temp++;
+ allDigits = (*temp == '.' || *temp == '\0');
+ }
+ if (allDigits) {
+ e.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
+ }
+ else {
+ BSONObjIterator i(e.embeddedObject());
+ while ( i.more() ) {
+ BSONElement e2 = i.next();
+ if (e2.type() == Object || e2.type() == Array)
+ e2.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
+ }
+ }
+ }
+ else {
+ // do nothing: no match
+ }
+ }
+ }
+ else {
+ if (e.type() == Array && expandLastArray) {
+ BSONObjIterator i(e.embeddedObject());
+ while ( i.more() )
+ ret.insert(i.next());
+ }
+ else {
+ ret.insert(e);
+ }
+ }
+ }
+
+ void BSONObj::getFieldsDotted(const StringData& name, BSONElementSet &ret, bool expandLastArray ) const {
+ _getFieldsDotted( this, name, ret, expandLastArray );
+ }
+ void BSONObj::getFieldsDotted(const StringData& name, BSONElementMSet &ret, bool expandLastArray ) const {
+ _getFieldsDotted( this, name, ret, expandLastArray );
+ }
+
+ BSONElement BSONObj::getFieldDottedOrArray(const char *&name) const {
+ const char *p = strchr(name, '.');
+
+ BSONElement sub;
+
+ if ( p ) {
+ sub = getField( string(name, p-name) );
+ name = p + 1;
+ }
+ else {
+ sub = getField( name );
+ name = name + strlen(name);
+ }
+
+ if ( sub.eoo() )
+ return eooElement;
+ else if ( sub.type() == Array || name[0] == '\0' )
+ return sub;
+ else if ( sub.type() == Object )
+ return sub.embeddedObject().getFieldDottedOrArray( name );
+ else
+ return eooElement;
+ }
+
+ /**
+ sets element field names to empty string
+ If a field in pattern is missing, it is omitted from the returned
+ object.
+ */
+ BSONObj BSONObj::extractFieldsUnDotted(BSONObj pattern) const {
+ BSONObjBuilder b;
+ BSONObjIterator i(pattern);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ BSONElement x = getField(e.fieldName());
+ if ( !x.eoo() )
+ b.appendAs(x, "");
+ }
+ return b.obj();
+ }
+
+ BSONObj BSONObj::extractFields(const BSONObj& pattern , bool fillWithNull ) const {
+ BSONObjBuilder b(32); // scanandorder.h can make a zillion of these, so we start the allocation very small
+ BSONObjIterator i(pattern);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ BSONElement x = getFieldDotted(e.fieldName());
+ if ( ! x.eoo() )
+ b.appendAs( x, e.fieldName() );
+ else if ( fillWithNull )
+ b.appendNull( e.fieldName() );
+ }
+ return b.obj();
+ }
+
+ BSONObj BSONObj::filterFieldsUndotted( const BSONObj &filter, bool inFilter ) const {
+ BSONObjBuilder b;
+ BSONObjIterator i( *this );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ BSONElement x = filter.getField( e.fieldName() );
+ if ( ( x.eoo() && !inFilter ) ||
+ ( !x.eoo() && inFilter ) )
+ b.append( e );
+ }
+ return b.obj();
+ }
+
+ BSONElement BSONObj::getFieldUsingIndexNames(const char *fieldName, const BSONObj &indexKey) const {
+ BSONObjIterator i( indexKey );
+ int j = 0;
+ while( i.moreWithEOO() ) {
+ BSONElement f = i.next();
+ if ( f.eoo() )
+ return BSONElement();
+ if ( strcmp( f.fieldName(), fieldName ) == 0 )
+ break;
+ ++j;
+ }
+ BSONObjIterator k( *this );
+ while( k.moreWithEOO() ) {
+ BSONElement g = k.next();
+ if ( g.eoo() )
+ return BSONElement();
+ if ( j == 0 ) {
+ return g;
+ }
+ --j;
+ }
+ return BSONElement();
+ }
+
+ /* grab names of all the fields in this object */
+ int BSONObj::getFieldNames(set<string>& fields) const {
+ int n = 0;
+ BSONObjIterator i(*this);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ fields.insert(e.fieldName());
+ n++;
+ }
+ return n;
+ }
+
+    /* note: addFields always adds _id even if not specified.
+       returns the number of fields added, not counting _id unless it was requested.
+    */
+ int BSONObj::addFields(BSONObj& from, set<string>& fields) {
+ assert( isEmpty() && !isOwned() ); /* partial implementation for now... */
+
+ BSONObjBuilder b;
+
+ int N = fields.size();
+ int n = 0;
+ BSONObjIterator i(from);
+ bool gotId = false;
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ const char *fname = e.fieldName();
+ if ( fields.count(fname) ) {
+ b.append(e);
+ ++n;
+ gotId = gotId || strcmp(fname, "_id")==0;
+ if ( n == N && gotId )
+ break;
+ }
+ else if ( strcmp(fname, "_id")==0 ) {
+ b.append(e);
+ gotId = true;
+ if ( n == N && gotId )
+ break;
+ }
+ }
+
+ if ( n ) {
+ *this = b.obj();
+ }
+
+ return n;
+ }
+
+ bool BSONObj::couldBeArray() const {
+ BSONObjIterator i( *this );
+ int index = 0;
+ while( i.moreWithEOO() ){
+ BSONElement e = i.next();
+ if( e.eoo() ) break;
+
+ // TODO: If actually important, may be able to do int->char* much faster
+ if( strcmp( e.fieldName(), ((string)( mongoutils::str::stream() << index )).c_str() ) != 0 )
+ return false;
+ index++;
+ }
+ return true;
+ }
+
+ BSONObj BSONObj::clientReadable() const {
+ BSONObjBuilder b;
+ BSONObjIterator i( *this );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ switch( e.type() ) {
+ case MinKey: {
+ BSONObjBuilder m;
+ m.append( "$minElement", 1 );
+ b.append( e.fieldName(), m.done() );
+ break;
+ }
+ case MaxKey: {
+ BSONObjBuilder m;
+ m.append( "$maxElement", 1 );
+ b.append( e.fieldName(), m.done() );
+ break;
+ }
+ default:
+ b.append( e );
+ }
+ }
+ return b.obj();
+ }
+
+ BSONObj BSONObj::replaceFieldNames( const BSONObj &names ) const {
+ BSONObjBuilder b;
+ BSONObjIterator i( *this );
+ BSONObjIterator j( names );
+ BSONElement f = j.moreWithEOO() ? j.next() : BSONObj().firstElement();
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( !f.eoo() ) {
+ b.appendAs( e, f.fieldName() );
+ f = j.next();
+ }
+ else {
+ b.append( e );
+ }
+ }
+ return b.obj();
+ }
+
+ bool BSONObj::okForStorage() const {
+ BSONObjIterator i( *this );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ const char * name = e.fieldName();
+
+ if ( strchr( name , '.' ) ||
+ strchr( name , '$' ) ) {
+ return
+ strcmp( name , "$ref" ) == 0 ||
+ strcmp( name , "$id" ) == 0
+ ;
+ }
+
+ if ( e.mayEncapsulate() ) {
+ switch ( e.type() ) {
+ case Object:
+ case Array:
+ if ( ! e.embeddedObject().okForStorage() )
+ return false;
+ break;
+ case CodeWScope:
+ if ( ! e.codeWScopeObject().okForStorage() )
+ return false;
+ break;
+ default:
+ uassert( 12579, "unhandled cases in BSONObj okForStorage" , 0 );
+ }
+
+ }
+ }
+ return true;
+ }
+
+ void BSONObj::dump() const {
+ out() << hex;
+ const char *p = objdata();
+ for ( int i = 0; i < objsize(); i++ ) {
+ out() << i << '\t' << ( 0xff & ( (unsigned) *p ) );
+ if ( *p >= 'A' && *p <= 'z' )
+ out() << '\t' << *p;
+ out() << endl;
+ p++;
+ }
+ }
+
+ void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base) {
+ BSONObjIterator it(obj);
+ while (it.more()) {
+ BSONElement e = it.next();
+ if (e.type() == Object) {
+ string newbase = base + e.fieldName() + ".";
+ nested2dotted(b, e.embeddedObject(), newbase);
+ }
+ else {
+ string newbase = base + e.fieldName();
+ b.appendAs(e, newbase);
+ }
+ }
+ }
+
+ void dotted2nested(BSONObjBuilder& b, const BSONObj& obj) {
+ //use map to sort fields
+ BSONMap sorted = bson2map(obj);
+ EmbeddedBuilder eb(&b);
+ for(BSONMap::const_iterator it=sorted.begin(); it!=sorted.end(); ++it) {
+ eb.appendAs(it->second, it->first);
+ }
+ eb.done();
+ }
+
+ /*-- test things ----------------------------------------------------*/
+
+#pragma pack(1)
+ struct MaxKeyData {
+ MaxKeyData() {
+ totsize=7;
+ maxkey=MaxKey;
+ name=0;
+ eoo=EOO;
+ }
+ int totsize;
+ char maxkey;
+ char name;
+ char eoo;
+ } maxkeydata;
+ BSONObj maxKey((const char *) &maxkeydata);
+
+ struct MinKeyData {
+ MinKeyData() {
+ totsize=7;
+ minkey=MinKey;
+ name=0;
+ eoo=EOO;
+ }
+ int totsize;
+ char minkey;
+ char name;
+ char eoo;
+ } minkeydata;
+ BSONObj minKey((const char *) &minkeydata);
+
+ /*
+ struct JSObj0 {
+ JSObj0() {
+ totsize = 5;
+ eoo = EOO;
+ }
+ int totsize;
+ char eoo;
+ } js0;
+ */
+#pragma pack()
+
+ struct BsonUnitTest : public UnitTest {
+ void testRegex() {
+
+ BSONObjBuilder b;
+ b.appendRegex("x", "foo");
+ BSONObj o = b.done();
+
+ BSONObjBuilder c;
+ c.appendRegex("x", "goo");
+ BSONObj p = c.done();
+
+ assert( !o.binaryEqual( p ) );
+ assert( o.woCompare( p ) < 0 );
+
+ }
+ void testoid() {
+ OID id;
+ id.init();
+ // sleepsecs(3);
+
+ OID b;
+ // goes with sleep above...
+ // b.init();
+ // assert( memcmp(id.getData(), b.getData(), 12) < 0 );
+
+ b.init( id.str() );
+ assert( b == id );
+ }
+
+ void testbounds() {
+ BSONObj l , r;
+ {
+ BSONObjBuilder b;
+ b.append( "x" , numeric_limits<long long>::max() );
+ l = b.obj();
+ }
+ {
+ BSONObjBuilder b;
+ b.append( "x" , numeric_limits<double>::max() );
+ r = b.obj();
+ }
+ assert( l.woCompare( r ) < 0 );
+ assert( r.woCompare( l ) > 0 );
+ {
+ BSONObjBuilder b;
+ b.append( "x" , numeric_limits<int>::max() );
+ l = b.obj();
+ }
+ assert( l.woCompare( r ) < 0 );
+ assert( r.woCompare( l ) > 0 );
+ }
+
+ void testorder() {
+ {
+ BSONObj x,y,z;
+ { BSONObjBuilder b; b.append( "x" , (long long)2 ); x = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (int)3 ); y = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (long long)4 ); z = b.obj(); }
+ assert( x.woCompare( y ) < 0 );
+ assert( x.woCompare( z ) < 0 );
+ assert( y.woCompare( x ) > 0 );
+ assert( z.woCompare( x ) > 0 );
+ assert( y.woCompare( z ) < 0 );
+ assert( z.woCompare( y ) > 0 );
+ }
+
+ {
+ BSONObj ll,d,i,n,u;
+ { BSONObjBuilder b; b.append( "x" , (long long)2 ); ll = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (double)2 ); d = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (int)2 ); i = b.obj(); }
+ { BSONObjBuilder b; b.appendNull( "x" ); n = b.obj(); }
+ { BSONObjBuilder b; u = b.obj(); }
+
+ assert( ll.woCompare( u ) == d.woCompare( u ) );
+ assert( ll.woCompare( u ) == i.woCompare( u ) );
+ BSONObj k = BSON( "x" << 1 );
+ assert( ll.woCompare( u , k ) == d.woCompare( u , k ) );
+ assert( ll.woCompare( u , k ) == i.woCompare( u , k ) );
+
+ assert( u.woCompare( ll ) == u.woCompare( d ) );
+ assert( u.woCompare( ll ) == u.woCompare( i ) );
+ assert( u.woCompare( ll , k ) == u.woCompare( d , k ) );
+ assert( u.woCompare( ll , k ) == u.woCompare( d , k ) );
+
+ assert( i.woCompare( n ) == d.woCompare( n ) );
+
+ assert( ll.woCompare( n ) == d.woCompare( n ) );
+ assert( ll.woCompare( n ) == i.woCompare( n ) );
+ assert( ll.woCompare( n , k ) == d.woCompare( n , k ) );
+ assert( ll.woCompare( n , k ) == i.woCompare( n , k ) );
+
+ assert( n.woCompare( ll ) == n.woCompare( d ) );
+ assert( n.woCompare( ll ) == n.woCompare( i ) );
+ assert( n.woCompare( ll , k ) == n.woCompare( d , k ) );
+ assert( n.woCompare( ll , k ) == n.woCompare( d , k ) );
+ }
+
+ {
+ BSONObj l,r;
+ { BSONObjBuilder b; b.append( "x" , "eliot" ); l = b.obj(); }
+ { BSONObjBuilder b; b.appendSymbol( "x" , "eliot" ); r = b.obj(); }
+ assert( l.woCompare( r ) == 0 );
+ assert( r.woCompare( l ) == 0 );
+ }
+ }
+
+ void run() {
+ testRegex();
+ BSONObjBuilder A,B,C;
+ A.append("x", 2);
+ B.append("x", 2.0);
+ C.append("x", 2.1);
+ BSONObj a = A.done();
+ BSONObj b = B.done();
+ BSONObj c = C.done();
+ assert( !a.binaryEqual( b ) ); // comments on operator==
+ int cmp = a.woCompare(b);
+ assert( cmp == 0 );
+ cmp = a.woCompare(c);
+ assert( cmp < 0 );
+ testoid();
+ testbounds();
+ testorder();
+ }
+ } bson_unittest;
+
+ Labeler::Label GT( "$gt" );
+ Labeler::Label GTE( "$gte" );
+ Labeler::Label LT( "$lt" );
+ Labeler::Label LTE( "$lte" );
+ Labeler::Label NE( "$ne" );
+ Labeler::Label SIZE( "$size" );
+
+ void BSONObjBuilder::appendMinForType( const StringData& fieldName , int t ) {
+ switch ( t ) {
+
+ // Shared canonical types
+ case NumberInt:
+ case NumberDouble:
+ case NumberLong:
+ append( fieldName , - numeric_limits<double>::max() ); return;
+ case Symbol:
+ case String:
+ append( fieldName , "" ); return;
+ case Date:
+ // min varies with V0 and V1 indexes, so we go one type lower.
+ appendBool(fieldName, true);
+ //appendDate( fieldName , numeric_limits<long long>::min() );
+ return;
+ case Timestamp: // TODO integrate with Date SERVER-3304
+ appendTimestamp( fieldName , 0 ); return;
+ case Undefined: // shared with EOO
+ appendUndefined( fieldName ); return;
+
+ // Separate canonical types
+ case MinKey:
+ appendMinKey( fieldName ); return;
+ case MaxKey:
+ appendMaxKey( fieldName ); return;
+ case jstOID: {
+ OID o;
+ memset(&o, 0, sizeof(o));
+ appendOID( fieldName , &o);
+ return;
+ }
+ case Bool:
+ appendBool( fieldName , false); return;
+ case jstNULL:
+ appendNull( fieldName ); return;
+ case Object:
+ append( fieldName , BSONObj() ); return;
+ case Array:
+ appendArray( fieldName , BSONObj() ); return;
+ case BinData:
+ appendBinData( fieldName , 0 , BinDataGeneral , (const char *) 0 ); return;
+ case RegEx:
+ appendRegex( fieldName , "" ); return;
+ case DBRef: {
+ OID o;
+ memset(&o, 0, sizeof(o));
+ appendDBRef( fieldName , "" , o );
+ return;
+ }
+ case Code:
+ appendCode( fieldName , "" ); return;
+ case CodeWScope:
+ appendCodeWScope( fieldName , "" , BSONObj() ); return;
+ };
+ log() << "type not supported for appendMinElementForType: " << t << endl;
+ uassert( 10061 , "type not supported for appendMinElementForType" , false );
+ }
+
+ void BSONObjBuilder::appendMaxForType( const StringData& fieldName , int t ) {
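+        // for several types the max is expressed as the min of the next type in
+        // the canonical sort order (e.g. max of String == min of Object).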
+ switch ( t ) {
+
+ // Shared canonical types
+ case NumberInt:
+ case NumberDouble:
+ case NumberLong:
+ append( fieldName , numeric_limits<double>::max() ); return;
+ case Symbol:
+ case String:
+ appendMinForType( fieldName, Object ); return;
+ case Date:
+ appendDate( fieldName , numeric_limits<long long>::max() ); return;
+ case Timestamp: // TODO integrate with Date SERVER-3304
+ appendTimestamp( fieldName , numeric_limits<unsigned long long>::max() ); return;
+ case Undefined: // shared with EOO
+ appendUndefined( fieldName ); return;
+
+ // Separate canonical types
+ case MinKey:
+ appendMinKey( fieldName ); return;
+ case MaxKey:
+ appendMaxKey( fieldName ); return;
+ case jstOID: {
+ OID o;
+ memset(&o, 0xFF, sizeof(o));
+ appendOID( fieldName , &o);
+ return;
+ }
+ case Bool:
+ appendBool( fieldName , true ); return;
+ case jstNULL:
+ appendNull( fieldName ); return;
+ case Object:
+ appendMinForType( fieldName, Array ); return;
+ case Array:
+ appendMinForType( fieldName, BinData ); return;
+ case BinData:
+ appendMinForType( fieldName, jstOID ); return;
+ case RegEx:
+ appendMinForType( fieldName, DBRef ); return;
+ case DBRef:
+ appendMinForType( fieldName, Code ); return;
+ case Code:
+ appendMinForType( fieldName, CodeWScope ); return;
+ case CodeWScope:
+ // This upper bound may change if a new bson type is added.
+ appendMinForType( fieldName , MaxKey ); return;
+ }
+ log() << "type not supported for appendMaxElementForType: " << t << endl;
+ uassert( 14853 , "type not supported for appendMaxElementForType" , false );
+ }
+
+ int BSONElementFieldSorter( const void * a , const void * b ) {
+ const char * x = *((const char**)a);
+ const char * y = *((const char**)b);
+ x++; y++;
+ return lexNumCmp( x , y );
+ }
+
+ bool fieldsMatch(const BSONObj& lhs, const BSONObj& rhs) {
+ BSONObjIterator l(lhs);
+ BSONObjIterator r(rhs);
+
+ while (l.more() && r.more()){
+ if (strcmp(l.next().fieldName(), r.next().fieldName())) {
+ return false;
+ }
+ }
+
+ return !(l.more() || r.more()); // false if lhs and rhs have diff nFields()
+ }
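+
+ // e.g. fieldsMatch( BSON( "a" << 1 << "b" << 1 ), BSON( "a" << 2 << "b" << "x" ) )
+ // returns true: only field names and their order are compared, never values.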
+
+ BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ) {
+ _nfields = o.nFields();
+ _fields = new const char*[_nfields];
+ int x = 0;
+ BSONObjIterator i( o );
+ while ( i.more() ) {
+ _fields[x++] = i.next().rawdata();
+ assert( _fields[x-1] );
+ }
+ assert( x == _nfields );
+ qsort( _fields , _nfields , sizeof(char*) , BSONElementFieldSorter );
+ _cur = 0;
+ }
+
+ bool BSONObjBuilder::appendAsNumber( const StringData& fieldName , const string& data ) {
+ if ( data.size() == 0 || data == "-" || data == ".")
+ return false;
+
+ unsigned int pos=0;
+ if ( data[0] == '-' )
+ pos++;
+
+ bool hasDec = false;
+
+ for ( ; pos<data.size(); pos++ ) {
+ if ( isdigit(data[pos]) )
+ continue;
+
+ if ( data[pos] == '.' ) {
+ if ( hasDec )
+ return false;
+ hasDec = true;
+ continue;
+ }
+
+ return false;
+ }
+
+ if ( hasDec ) {
+ double d = atof( data.c_str() );
+ append( fieldName , d );
+ return true;
+ }
+
+ if ( data.size() < 8 ) {
+ append( fieldName , atoi( data.c_str() ) );
+ return true;
+ }
+
+ try {
+ long long num = boost::lexical_cast<long long>( data );
+ append( fieldName , num );
+ return true;
+ }
+ catch(bad_lexical_cast &) {
+ return false;
+ }
+ }
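+
+ // e.g. appendAsNumber( "n", "12.5" ) appends a double, "123" an int (fewer than
+ // 8 chars), "12345678901" a long long; "12a" and "1.2.3" return false.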
+
+} // namespace mongo
diff --git a/src/mongo/db/jsobj.h b/src/mongo/db/jsobj.h
new file mode 100644
index 00000000000..ae039529fbf
--- /dev/null
+++ b/src/mongo/db/jsobj.h
@@ -0,0 +1,47 @@
+/** @file jsobj.h
+ BSON classes
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ BSONObj and its helpers
+
+ "BSON" stands for "binary JSON" -- ie a binary way to represent objects that would be
+ represented in JSON (plus a few extensions useful for databases & other languages).
+
+ http://www.bsonspec.org/
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../bson/util/builder.h"
+#include "../util/optime.h"
+//#include "boost/utility.hpp"
+//#include <set>
+#include "../bson/bsontypes.h"
+#include "../bson/oid.h"
+#include "../bson/bsonelement.h"
+#include "../bson/bsonobj.h"
+#include "../bson/bsonmisc.h"
+#include "../bson/bsonobjbuilder.h"
+#include "../bson/bsonobjiterator.h"
+#include "../bson/bson-inl.h"
+#include "../bson/ordering.h"
+#include "../bson/stringdata.h"
+#include "../bson/bson_db.h"
+
diff --git a/src/mongo/db/jsobjmanipulator.h b/src/mongo/db/jsobjmanipulator.h
new file mode 100644
index 00000000000..860e575940e
--- /dev/null
+++ b/src/mongo/db/jsobjmanipulator.h
@@ -0,0 +1,94 @@
+/** jsobjManipulator.h */
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+//#include "dur.h"
+
+namespace mongo {
+
+ /** Manipulate the binary representation of a BSONElement in-place.
+ Careful, this casts away const.
+ */
+ class BSONElementManipulator {
+ public:
+ BSONElementManipulator( const BSONElement &element ) :
+ _element( element ) {
+ assert( !_element.eoo() );
+ }
+ /** Replace a Timestamp type with a Date type initialized to
+ OpTime::now().asDate()
+ */
+ void initTimestamp();
+
+ // Note: the methods whose names start with a capital letter go through
+ // getDur().writing() and are journaled
+
+ /** Change the value, in place, of the number. */
+ void setNumber(double d) {
+ if ( _element.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d;
+ else if ( _element.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d;
+ else assert(0);
+ }
+ void SetNumber(double d);
+ void setLong(long long n) {
+ assert( _element.type() == NumberLong );
+ *reinterpret_cast< long long * >( value() ) = n;
+ }
+ void SetLong(long long n);
+ void setInt(int n) {
+ assert( _element.type() == NumberInt );
+ *reinterpret_cast< int * >( value() ) = n;
+ }
+ void SetInt(int n);
+
+ /** Replace the type and value of the element with the type and value of e,
+ preserving the original fieldName */
+ void replaceTypeAndValue( const BSONElement &e ) {
+ *data() = e.type();
+ memcpy( value(), e.value(), e.valuesize() );
+ }
+
+ /* dur:: version */
+ void ReplaceTypeAndValue( const BSONElement &e );
+
+ static void lookForTimestamps( const BSONObj& obj ) {
+ // If have a Timestamp field as the first or second element,
+ // update it to a Date field set to OpTime::now().asDate(). The
+ // replacement policy is a work in progress.
+
+ BSONObjIterator i( obj );
+ for( int j = 0; i.moreWithEOO() && j < 2; ++j ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( e.type() == Timestamp ) {
+ BSONElementManipulator( e ).initTimestamp();
+ break;
+ }
+ }
+ }
+ private:
+ char *data() { return nonConst( _element.rawdata() ); }
+ char *value() { return nonConst( _element.value() ); }
+ static char *nonConst( const char *s ) { return const_cast< char * >( s ); }
+
+ const BSONElement _element;
+ };
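+
+ // Usage sketch (illustrative; assumes elt is a NumberInt element):
+ // BSONElementManipulator( elt ).setInt( 5 ); // in place, not journaled
+ // BSONElementManipulator( elt ).SetInt( 5 ); // journaled via getDur().writing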
+
+} // namespace mongo
diff --git a/src/mongo/db/json.cpp b/src/mongo/db/json.cpp
new file mode 100644
index 00000000000..73457a2bfbb
--- /dev/null
+++ b/src/mongo/db/json.cpp
@@ -0,0 +1,651 @@
+// json.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#define BOOST_SPIRIT_THREADSAFE
+#if BOOST_VERSION >= 103800
+#define BOOST_SPIRIT_USE_OLD_NAMESPACE
+#include <boost/spirit/include/classic_core.hpp>
+#include <boost/spirit/include/classic_loops.hpp>
+#include <boost/spirit/include/classic_lists.hpp>
+#else
+#include <boost/spirit/core.hpp>
+#include <boost/spirit/utility/loops.hpp>
+#include <boost/spirit/utility/lists.hpp>
+#endif
+#undef assert
+#define assert MONGO_assert
+
+#include "json.h"
+#include "../bson/util/builder.h"
+#include "../util/base64.h"
+#include "../util/hex.h"
+
+
+using namespace boost::spirit;
+
+namespace mongo {
+
+ struct ObjectBuilder : boost::noncopyable {
+ ~ObjectBuilder() {
+ unsigned i = builders.size();
+ if ( i ) {
+ i--;
+ for ( ; i>=1; i-- ) {
+ if ( builders[i] ) {
+ builders[i]->done();
+ }
+ }
+ }
+ }
+ BSONObjBuilder *back() {
+ return builders.back().get();
+ }
+ // Storage for field names of elements within builders.back().
+ const char *fieldName() {
+ return fieldNames.back().c_str();
+ }
+ bool empty() const {
+ return builders.size() == 0;
+ }
+ void init() {
+ boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ builders.push_back( b );
+ fieldNames.push_back( "" );
+ indexes.push_back( 0 );
+ }
+ void pushObject( const char *fieldName ) {
+ boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder( builders.back()->subobjStart( fieldName ) ) );
+ builders.push_back( b );
+ fieldNames.push_back( "" );
+ indexes.push_back( 0 );
+ }
+ void pushArray( const char *fieldName ) {
+ boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder( builders.back()->subarrayStart( fieldName ) ) );
+ builders.push_back( b );
+ fieldNames.push_back( "" );
+ indexes.push_back( 0 );
+ }
+ BSONObj pop() {
+ BSONObj ret;
+ if ( back()->owned() )
+ ret = back()->obj();
+ else
+ ret = back()->done();
+ builders.pop_back();
+ fieldNames.pop_back();
+ indexes.pop_back();
+ return ret;
+ }
+ void nameFromIndex() {
+ fieldNames.back() = BSONObjBuilder::numStr( indexes.back() );
+ }
+ string popString() {
+ string ret = ss.str();
+ ss.str( "" );
+ return ret;
+ }
+ // Cannot use auto_ptr because its copy constructor takes a non const reference.
+ vector< boost::shared_ptr< BSONObjBuilder > > builders;
+ vector< string > fieldNames;
+ vector< int > indexes;
+ stringstream ss;
+ string ns;
+ OID oid;
+ string binData;
+ BinDataType binDataType;
+ string regex;
+ string regexOptions;
+ Date_t date;
+ OpTime timestamp;
+ };
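+
+ // e.g. parsing { "a" : { "b" : 1 } } drives: init() at the outer '{',
+ // pushObject( "a" ) at the inner '{', a pop() when the inner object closes,
+ // and a final pop() in fromjson() for the outer object.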
+
+ struct objectStart {
+ objectStart( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char &c ) const {
+ if ( b.empty() )
+ b.init();
+ else
+ b.pushObject( b.fieldName() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct arrayStart {
+ arrayStart( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char &c ) const {
+ b.pushArray( b.fieldName() );
+ b.nameFromIndex();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct arrayNext {
+ arrayNext( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char &c ) const {
+ ++b.indexes.back();
+ b.nameFromIndex();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct ch {
+ ch( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char c ) const {
+ b.ss << c;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct chE {
+ chE( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char c ) const {
+ char o = '\0';
+ switch ( c ) {
+ case '\"':
+ o = '\"';
+ break;
+ case '\'':
+ o = '\'';
+ break;
+ case '\\':
+ o = '\\';
+ break;
+ case '/':
+ o = '/';
+ break;
+ case 'b':
+ o = '\b';
+ break;
+ case 'f':
+ o = '\f';
+ break;
+ case 'n':
+ o = '\n';
+ break;
+ case 'r':
+ o = '\r';
+ break;
+ case 't':
+ o = '\t';
+ break;
+ case 'v':
+ o = '\v';
+ break;
+ default:
+ assert( false );
+ }
+ b.ss << o;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct chU {
+ chU( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ unsigned char first = fromHex( start );
+ unsigned char second = fromHex( start + 2 );
+ if ( first == 0 && second < 0x80 )
+ b.ss << second;
+ else if ( first < 0x08 ) {
+ b.ss << char( 0xc0 | ( ( first << 2 ) | ( second >> 6 ) ) );
+ b.ss << char( 0x80 | ( ~0xc0 & second ) );
+ }
+ else {
+ b.ss << char( 0xe0 | ( first >> 4 ) );
+ b.ss << char( 0x80 | ( ~0xc0 & ( ( first << 2 ) | ( second >> 6 ) ) ) );
+ b.ss << char( 0x80 | ( ~0xc0 & second ) );
+ }
+ }
+ ObjectBuilder &b;
+ };
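+
+ // e.g. "\u00e9" parses as first == 0x00, second == 0xe9 and is emitted as the
+ // two-byte UTF-8 sequence 0xc3 0xa9.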
+
+ struct chClear {
+ chClear( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char c ) const {
+ b.popString();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct fieldNameEnd {
+ fieldNameEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ string name = b.popString();
+ massert( 10338 , "Invalid use of reserved field name: " + name,
+ name != "$oid" &&
+ name != "$binary" &&
+ name != "$type" &&
+ name != "$date" &&
+ name != "$timestamp" &&
+ name != "$regex" &&
+ name != "$options" );
+ b.fieldNames.back() = name;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct unquotedFieldNameEnd {
+ unquotedFieldNameEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ string name( start, end );
+ b.fieldNames.back() = name;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct stringEnd {
+ stringEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->append( b.fieldName(), b.popString() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct numberValue {
+ numberValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ string raw(start);
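+ // note raw runs from the token start to the end of the whole input; the
+ // prefix compares below only look at the token's leading characters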
+ double val;
+
+ // strtod isn't able to deal with NaN and inf in a portable way.
+ // Correspondingly, we perform the conversions explicitly.
+
+ if ( ! raw.compare(0, 3, "NaN" ) ) {
+ val = std::numeric_limits<double>::quiet_NaN();
+ }
+ else if ( ! raw.compare(0, 8, "Infinity" ) ) {
+ val = std::numeric_limits<double>::infinity();
+ }
+ else if ( ! raw.compare(0, 9, "-Infinity" ) ) {
+ val = -std::numeric_limits<double>::infinity();
+ }
+ else {
+ // We re-parse the numeric string here because spirit parsing of strings
+ // to doubles produces different results from strtod in some cases and
+ // we want to use strtod to ensure consistency with other string to
+ // double conversions in our code.
+
+ val = strtod( start, 0 );
+ }
+
+ b.back()->append( b.fieldName(), val );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct intValue {
+ intValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( long long num ) const {
+ if (num >= numeric_limits<int>::min() && num <= numeric_limits<int>::max())
+ b.back()->append( b.fieldName(), (int)num );
+ else
+ b.back()->append( b.fieldName(), num );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct subobjectEnd {
+ subobjectEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.pop();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct arrayEnd {
+ arrayEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.pop();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct trueValue {
+ trueValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendBool( b.fieldName(), true );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct falseValue {
+ falseValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendBool( b.fieldName(), false );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct nullValue {
+ nullValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendNull( b.fieldName() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct undefinedValue {
+ undefinedValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendUndefined( b.fieldName() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dbrefNS {
+ dbrefNS( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.ns = b.popString();
+ }
+ ObjectBuilder &b;
+ };
+
+// NOTE s must be 24 hex characters (12 bytes).
+ OID stringToOid( const char *s ) {
+ OID oid;
+ char *oidP = (char *)( &oid );
+ for ( int i = 0; i < 12; ++i )
+ oidP[ i ] = fromHex( s + ( i * 2 ) );
+ return oid;
+ }
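+
+ // e.g. stringToOid( "000102030405060708090a0b" ) yields an OID whose twelve
+ // bytes are 0x00 through 0x0b.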
+
+ struct oidValue {
+ oidValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.oid = stringToOid( start );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dbrefEnd {
+ dbrefEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendDBRef( b.fieldName(), b.ns, b.oid );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct oidEnd {
+ oidEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendOID( b.fieldName(), &b.oid );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct timestampEnd {
+ timestampEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendTimestamp( b.fieldName(), b.timestamp.asDate() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct binDataBinary {
+ binDataBinary( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ massert( 10339 , "Badly formatted bindata", ( end - start ) % 4 == 0 );
+ string encoded( start, end );
+ b.binData = base64::decode( encoded );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct binDataType {
+ binDataType( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.binDataType = BinDataType( fromHex( start ) );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct binDataEnd {
+ binDataEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendBinData( b.fieldName(), b.binData.length(),
+ b.binDataType, b.binData.data() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct timestampSecs {
+ timestampSecs( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( unsigned long long x) const {
+ b.timestamp = OpTime( (unsigned) (x/1000) , 0);
+ }
+ ObjectBuilder &b;
+ };
+
+ struct timestampInc {
+ timestampInc( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( unsigned x) const {
+ b.timestamp = OpTime(b.timestamp.getSecs(), x);
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dateValue {
+ dateValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( Date_t v ) const {
+ b.date = v;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dateEnd {
+ dateEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendDate( b.fieldName(), b.date );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct regexValue {
+ regexValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.regex = b.popString();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct regexOptions {
+ regexOptions( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.regexOptions = string( start, end );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct regexEnd {
+ regexEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendRegex( b.fieldName(), b.regex, b.regexOptions );
+ }
+ ObjectBuilder &b;
+ };
+
+// One gotcha with this parsing library is probably best illustrated with an
+// example. Say we have a production like this:
+// z = ( ch_p( 'a' )[ foo ] >> ch_p( 'b' ) ) | ( ch_p( 'a' )[ foo ] >> ch_p( 'c' ) );
+// On input "ac", action foo() will be called twice -- once as the parser tries
+// to match "ab", again as the parser successfully matches "ac". Sometimes
+// the grammar can be modified to eliminate these situations. Here, for example:
+// z = ch_p( 'a' )[ foo ] >> ( ch_p( 'b' ) | ch_p( 'c' ) );
+// However, this is not always possible. In my implementation I've tried to
+// stick to the following pattern: store fields fed to action callbacks
+// temporarily as ObjectBuilder members, then append to a BSONObjBuilder once
+// the parser has completely matched a nonterminal and won't backtrack. It's
+// worth noting here that this parser follows a short-circuit convention. So,
+// in the original z example on line 3, if the input was "ab", foo() would only
+// be called once.
+ struct JsonGrammar : public grammar< JsonGrammar > {
+ public:
+ JsonGrammar( ObjectBuilder &_b ) : b( _b ) {}
+
+ template < typename ScannerT >
+ struct definition {
+ definition( JsonGrammar const &self ) {
+ object = ch_p( '{' )[ objectStart( self.b ) ] >> !members >> '}';
+ members = list_p((fieldName >> ':' >> value) , ',');
+ fieldName =
+ str[ fieldNameEnd( self.b ) ] |
+ singleQuoteStr[ fieldNameEnd( self.b ) ] |
+ unquotedFieldName[ unquotedFieldNameEnd( self.b ) ];
+ array = ch_p( '[' )[ arrayStart( self.b ) ] >> !elements >> ']';
+ elements = list_p(value, ch_p(',')[arrayNext( self.b )]);
+ value =
+ str[ stringEnd( self.b ) ] |
+ number[ numberValue( self.b ) ] |
+ integer |
+ array[ arrayEnd( self.b ) ] |
+ lexeme_d[ str_p( "true" ) ][ trueValue( self.b ) ] |
+ lexeme_d[ str_p( "false" ) ][ falseValue( self.b ) ] |
+ lexeme_d[ str_p( "null" ) ][ nullValue( self.b ) ] |
+ lexeme_d[ str_p( "undefined" ) ][ undefinedValue( self.b ) ] |
+ singleQuoteStr[ stringEnd( self.b ) ] |
+ date[ dateEnd( self.b ) ] |
+ oid[ oidEnd( self.b ) ] |
+ bindata[ binDataEnd( self.b ) ] |
+ dbref[ dbrefEnd( self.b ) ] |
+ timestamp[ timestampEnd( self.b ) ] |
+ regex[ regexEnd( self.b ) ] |
+ object[ subobjectEnd( self.b ) ] ;
+ // NOTE lexeme_d and rules don't mix well, so we have this mess.
+ // NOTE We use range_p rather than cntrl_p, because the latter is locale dependent.
+ str = lexeme_d[ ch_p( '"' )[ chClear( self.b ) ] >>
+ *( ( ch_p( '\\' ) >>
+ (
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ch_p( 'v' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
+ ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
+ )
+ ) |
+ ( ~range_p( 0x00, 0x1f ) & ~ch_p( '"' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '"' ];
+
+ singleQuoteStr = lexeme_d[ ch_p( '\'' )[ chClear( self.b ) ] >>
+ *( ( ch_p( '\\' ) >>
+ (
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ch_p( 'v' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
+ ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
+ )
+ ) |
+ ( ~range_p( 0x00, 0x1f ) & ~ch_p( '\'' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '\'' ];
+
+ // real_p accepts numbers with nonsignificant zero prefixes, which
+ // aren't allowed in JSON. Oh well.
+ number = strict_real_p | str_p( "NaN" ) | str_p( "Infinity" ) | str_p( "-Infinity" );
+
+ static int_parser<long long, 10, 1, numeric_limits<long long>::digits10 + 1> long_long_p;
+ integer = long_long_p[ intValue(self.b) ];
+
+ // We allow a subset of valid js identifier names here.
+ unquotedFieldName = lexeme_d[ ( alpha_p | ch_p( '$' ) | ch_p( '_' ) ) >> *( ( alnum_p | ch_p( '$' ) | ch_p( '_' )) ) ];
+
+ dbref = dbrefS | dbrefT;
+ dbrefS = ch_p( '{' ) >> "\"$ref\"" >> ':' >>
+ str[ dbrefNS( self.b ) ] >> ',' >> "\"$id\"" >> ':' >> quotedOid >> '}';
+ dbrefT = str_p( "Dbref" ) >> '(' >> str[ dbrefNS( self.b ) ] >> ',' >>
+ quotedOid >> ')';
+
+ timestamp = ch_p( '{' ) >> "\"$timestamp\"" >> ':' >> '{' >>
+ "\"t\"" >> ':' >> uint_parser<unsigned long long, 10, 1, -1>()[ timestampSecs(self.b) ] >> ',' >>
+ "\"i\"" >> ':' >> uint_parser<unsigned int, 10, 1, -1>()[ timestampInc(self.b) ] >> '}' >>'}';
+
+ oid = oidS | oidT;
+ oidS = ch_p( '{' ) >> "\"$oid\"" >> ':' >> quotedOid >> '}';
+ oidT = str_p( "ObjectId" ) >> '(' >> quotedOid >> ')';
+
+ quotedOid = lexeme_d[ '"' >> ( repeat_p( 24 )[ xdigit_p ] )[ oidValue( self.b ) ] >> '"' ];
+
+ bindata = ch_p( '{' ) >> "\"$binary\"" >> ':' >>
+ lexeme_d[ '"' >> ( *( range_p( 'A', 'Z' ) | range_p( 'a', 'z' ) | range_p( '0', '9' ) | ch_p( '+' ) | ch_p( '/' ) ) >> *ch_p( '=' ) )[ binDataBinary( self.b ) ] >> '"' ] >> ',' >> "\"$type\"" >> ':' >>
+ lexeme_d[ '"' >> ( repeat_p( 2 )[ xdigit_p ] )[ binDataType( self.b ) ] >> '"' ] >> '}';
+
+ // TODO: this will need to use a signed parser at some point
+ date = dateS | dateT;
+ dateS = ch_p( '{' ) >> "\"$date\"" >> ':' >> uint_parser< Date_t >()[ dateValue( self.b ) ] >> '}';
+ dateT = !str_p("new") >> str_p( "Date" ) >> '(' >> uint_parser< Date_t >()[ dateValue( self.b ) ] >> ')';
+
+ regex = regexS | regexT;
+ regexS = ch_p( '{' ) >> "\"$regex\"" >> ':' >> str[ regexValue( self.b ) ] >> ',' >> "\"$options\"" >> ':' >> lexeme_d[ '"' >> ( *( alpha_p ) )[ regexOptions( self.b ) ] >> '"' ] >> '}';
+ // FIXME Obviously it would be nice to unify this with str.
+ regexT = lexeme_d[ ch_p( '/' )[ chClear( self.b ) ] >>
+ *( ( ch_p( '\\' ) >>
+ ( ch_p( '"' )[ chE( self.b ) ] |
+ ch_p( '\\' )[ chE( self.b ) ] |
+ ch_p( '/' )[ chE( self.b ) ] |
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) ) ) |
+ ( ~range_p( 0x00, 0x1f ) & ~ch_p( '/' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> str_p( "/" )[ regexValue( self.b ) ]
+ >> ( *( ch_p( 'i' ) | ch_p( 'g' ) | ch_p( 'm' ) ) )[ regexOptions( self.b ) ] ];
+ }
+ rule< ScannerT > object, members, array, elements, value, str, number, integer,
+ dbref, dbrefS, dbrefT, timestamp, timestampS, timestampT, oid, oidS, oidT,
+ bindata, date, dateS, dateT, regex, regexS, regexT, quotedOid, fieldName,
+ unquotedFieldName, singleQuoteStr;
+ const rule< ScannerT > &start() const {
+ return object;
+ }
+ };
+ ObjectBuilder &b;
+ };
+
+ BSONObj fromjson( const char *str , int* len) {
+ if ( str[0] == '\0' ) {
+ if (len) *len = 0;
+ return BSONObj();
+ }
+
+ ObjectBuilder b;
+ JsonGrammar parser( b );
+ parse_info<> result = parse( str, parser, space_p );
+ if (len) {
+ *len = result.stop - str;
+ }
+ else if ( !result.full ) {
+ int limit = strnlen(result.stop , 10);
+ if (limit == -1) limit = 10;
+ msgasserted(10340, "Failure parsing JSON string near: " + string( result.stop, limit ));
+ }
+ BSONObj ret = b.pop();
+ assert( b.empty() );
+ return ret;
+ }
+
+ BSONObj fromjson( const string &str ) {
+ return fromjson( str.c_str() );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/json.h b/src/mongo/db/json.h
new file mode 100644
index 00000000000..68dae042574
--- /dev/null
+++ b/src/mongo/db/json.h
@@ -0,0 +1,41 @@
+/** @file json.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ /** Create a BSONObj from a JSON <http://www.json.org> string. In addition
+ to the JSON extensions described here
+ <http://mongodb.onconfluence.com/display/DOCS/Mongo+Extended+JSON>,
+ this function accepts certain unquoted field names and allows single quotes
+ to optionally be used when specifying field names and string values instead
+ of double quotes. JSON unicode escape sequences (of the form \uXXXX) are
+ converted to utf8.
+ \throws MsgAssertionException if parsing fails. The message included with
+ this assertion includes a rough indication of where parsing failed.
+ */
+ BSONObj fromjson(const string &str);
+
+ /** len will be size of JSON object in text chars. */
+ BSONObj fromjson(const char *str, int* len=NULL);
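+
+ /** Example (illustrative):
+ BSONObj o = fromjson( "{ a : 1, b : 'hello' }" );
+ // o["a"].numberInt() == 1 -- unquoted field names and single quotes accepted
+ */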
+
+} // namespace mongo
diff --git a/src/mongo/db/key.cpp b/src/mongo/db/key.cpp
new file mode 100644
index 00000000000..47449986d21
--- /dev/null
+++ b/src/mongo/db/key.cpp
@@ -0,0 +1,678 @@
+// @file key.cpp
+
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "key.h"
+#include "../util/unittest.h"
+
+namespace mongo {
+
+ extern const Ordering nullOrdering = Ordering::make(BSONObj());
+
+ // KeyBson is for V0 (version #0) indexes
+
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o);
+
+ // "old" = pre signed dates & such; i.e. btree V0
+ /* must be same canon type when called */
+ int oldCompareElementValues(const BSONElement& l, const BSONElement& r) {
+ dassert( l.canonicalType() == r.canonicalType() );
+ int f;
+ double x;
+
+ switch ( l.type() ) {
+ case EOO:
+ case Undefined: // EOO and Undefined are same canonicalType
+ case jstNULL:
+ case MaxKey:
+ case MinKey:
+ return 0;
+ case Bool:
+ return *l.value() - *r.value();
+ case Timestamp:
+ case Date:
+ // unsigned dates for old version
+ if ( l.date() < r.date() )
+ return -1;
+ return l.date() == r.date() ? 0 : 1;
+ case NumberLong:
+ if( r.type() == NumberLong ) {
+ long long L = l._numberLong();
+ long long R = r._numberLong();
+ if( L < R ) return -1;
+ if( L == R ) return 0;
+ return 1;
+ }
+ // else fall through
+ case NumberInt:
+ case NumberDouble: {
+ double left = l.number();
+ double right = r.number();
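+ // the bounds checks below catch NaN (and also the infinities, which fail
+ // them too); such values compare equal to each other and sort below all others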
+ bool lNan = !( left <= numeric_limits< double >::max() &&
+ left >= -numeric_limits< double >::max() );
+ bool rNan = !( right <= numeric_limits< double >::max() &&
+ right >= -numeric_limits< double >::max() );
+ if ( lNan ) {
+ if ( rNan ) {
+ return 0;
+ }
+ else {
+ return -1;
+ }
+ }
+ else if ( rNan ) {
+ return 1;
+ }
+ x = left - right;
+ if ( x < 0 ) return -1;
+ return x == 0 ? 0 : 1;
+ }
+ case jstOID:
+ return memcmp(l.value(), r.value(), 12);
+ case Code:
+ case Symbol:
+ case String:
+ // nulls not allowed in the middle of strings in the old version
+ return strcmp(l.valuestr(), r.valuestr());
+ case Object:
+ case Array:
+ return oldCompare(l.embeddedObject(), r.embeddedObject(), nullOrdering);
+ case DBRef: {
+ int lsz = l.valuesize();
+ int rsz = r.valuesize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value(), r.value(), lsz);
+ }
+ case BinData: {
+ int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
+ int rsz = r.objsize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value()+4, r.value()+4, lsz+1);
+ }
+ case RegEx: {
+ int c = strcmp(l.regex(), r.regex());
+ if ( c )
+ return c;
+ return strcmp(l.regexFlags(), r.regexFlags());
+ }
+ case CodeWScope : {
+ f = l.canonicalType() - r.canonicalType();
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeScopeData() , r.codeWScopeScopeData() );
+ if ( f )
+ return f;
+ return 0;
+ }
+ default:
+ out() << "oldCompareElementValues: bad type " << (int) l.type() << endl;
+ assert(false);
+ }
+ return -1;
+ }
+
+ int oldElemCompare(const BSONElement&l , const BSONElement& r) {
+ int lt = (int) l.canonicalType();
+ int rt = (int) r.canonicalType();
+ int x = lt - rt;
+ if( x )
+ return x;
+ return oldCompareElementValues(l, r);
+ }
+
+ // pre signed dates & such
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o) {
+ BSONObjIterator i(l);
+ BSONObjIterator j(r);
+ unsigned mask = 1;
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ {
+ x = oldElemCompare(l, r);
+ if( o.descending(mask) )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ mask <<= 1;
+ }
+ return -1;
+ }
+
+ /* old style compares:
+ - dates are unsigned
+ - strings no nulls
+ */
+ int KeyBson::woCompare(const KeyBson& r, const Ordering &o) const {
+ return oldCompare(_o, r._o, o);
+ }
+
+ // woEqual could be made faster than woCompare, but this is only for backward
+ // compatibility, so it is not worth a big effort
+ bool KeyBson::woEqual(const KeyBson& r) const {
+ return oldCompare(_o, r._o, nullOrdering) == 0;
+ }
+
+ // [ ][HASMORE][x][y][canontype_4bits]
+ enum CanonicalsEtc {
+ cminkey=1,
+ cnull=2,
+ cdouble=4,
+ cstring=6,
+ cbindata=7,
+ coid=8,
+ cfalse=10,
+ ctrue=11,
+ cdate=12,
+ cmaxkey=14,
+ cCANONTYPEMASK = 0xf,
+ cY = 0x10,
+ cint = cY | cdouble,
+ cX = 0x20,
+ clong = cX | cdouble,
+ cHASMORE = 0x40,
+ cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care
+ };
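+
+ // e.g. a key byte of ( cint | cHASMORE ) == 0x54 decodes as canonical type
+ // cdouble (low nibble 0x4), the cY bit marking a value stored from a NumberInt,
+ // and cHASMORE indicating that further key fields follow.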
+
+ // bindata bson type
+ const unsigned BinDataLenMask = 0xf0; // lengths are powers of 2 of this value
+ const unsigned BinDataTypeMask = 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType.
+ const int BinDataLenMax = 32;
+ const int BinDataLengthToCode[] = {
+ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, -1/*9*/, 0x90/*10*/, -1/*11*/, 0xa0/*12*/, -1/*13*/, 0xb0/*14*/, -1/*15*/,
+ 0xc0/*16*/, -1, -1, -1, 0xd0/*20*/, -1, -1, -1,
+ 0xe0/*24*/, -1, -1, -1, -1, -1, -1, -1,
+ 0xf0/*32*/
+ };
+ const int BinDataCodeToLength[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32
+ };
+
+ int binDataCodeToLength(int codeByte) {
+ return BinDataCodeToLength[codeByte >> 4];
+ }
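+
+ // e.g. a bindata code byte of 0xa3 has high nibble 0xa, so binDataCodeToLength(0xa3)
+ // == BinDataCodeToLength[10] == 12 bytes; the low nibble 0x3 carries the subtype.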
+
+ /** The object cannot be represented in compact format, so store it in traditional
+ bson format with a leading sentinel byte, IsBSON, to indicate that format.
+
+ Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here
+ so that we don't have to do an extra malloc.
+ */
+ void KeyV1Owned::traditional(const BSONObj& obj) {
+ b.reset();
+ b.appendUChar(IsBSON);
+ b.appendBuf(obj.objdata(), obj.objsize());
+ _keyData = (const unsigned char *) b.buf();
+ }
+
+ KeyV1Owned::KeyV1Owned(const KeyV1& rhs) {
+ b.appendBuf( rhs.data(), rhs.dataSize() );
+ _keyData = (const unsigned char *) b.buf();
+ dassert( b.len() == dataSize() ); // check datasize method is correct
+ dassert( (*_keyData & cNOTUSED) == 0 );
+ }
+
+ // fromBSON to Key format
+ KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
+ BSONObj::iterator i(obj);
+ unsigned char bits = 0;
+ while( 1 ) {
+ BSONElement e = i.next();
+ if( i.more() )
+ bits |= cHASMORE;
+ switch( e.type() ) {
+ case MinKey:
+ b.appendUChar(cminkey|bits);
+ break;
+ case jstNULL:
+ b.appendUChar(cnull|bits);
+ break;
+ case MaxKey:
+ b.appendUChar(cmaxkey|bits);
+ break;
+ case Bool:
+ b.appendUChar( (e.boolean()?ctrue:cfalse) | bits );
+ break;
+ case jstOID:
+ b.appendUChar(coid|bits);
+ b.appendBuf(&e.__oid(), sizeof(OID));
+ break;
+ case BinData:
+ {
+ int t = e.binDataType();
+ // 0-7 and 0x80 to 0x87 are supported by Key
+ if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) {
+ int len;
+ const char * d = e.binData(len);
+ if( len <= BinDataLenMax ) {
+ int code = BinDataLengthToCode[len];
+ if( code >= 0 ) {
+ if( t >= 128 )
+ t = (t-128) | 0x08;
+ dassert( (code&t) == 0 );
+ b.appendUChar( cbindata|bits );
+ b.appendUChar( code | t );
+ b.appendBuf(d, len);
+ break;
+ }
+ }
+ }
+ traditional(obj);
+ return;
+ }
+ case Date:
+ b.appendUChar(cdate|bits);
+ b.appendStruct(e.date());
+ break;
+ case String:
+ {
+ b.appendUChar(cstring|bits);
+ // note we do not store the terminating null, to save space.
+ unsigned x = (unsigned) e.valuestrsize() - 1;
+ if( x > 255 ) {
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(x);
+ b.appendBuf(e.valuestr(), x);
+ break;
+ }
+ case NumberInt:
+ b.appendUChar(cint|bits);
+ b.appendNum((double) e._numberInt());
+ break;
+ case NumberLong:
+ {
+ long long n = e._numberLong();
+ long long m = 2LL << 52;
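+ // m == 2^53; doubles represent integers exactly only within that range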
+ DEV {
+ long long d = m-1;
+ assert( ((long long) ((double) -d)) == -d );
+ }
+ if( n >= m || n <= -m ) {
+ // can't represent exactly as a double
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(clong|bits);
+ b.appendNum((double) n);
+ break;
+ }
+ case NumberDouble:
+ {
+ double d = e._numberDouble();
+ if( isNaN(d) ) {
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(cdouble|bits);
+ b.appendNum(d);
+ break;
+ }
+ default:
+ // if other types involved, store as traditional BSON
+ traditional(obj);
+ return;
+ }
+ if( !i.more() )
+ break;
+ bits = 0;
+ }
+ _keyData = (const unsigned char *) b.buf();
+ dassert( b.len() == dataSize() ); // check datasize method is correct
+ dassert( (*_keyData & cNOTUSED) == 0 );
+ }
+
+ BSONObj KeyV1::toBson() const {
+ assert( _keyData != 0 );
+ if( !isCompactFormat() )
+ return bson();
+
+ BSONObjBuilder b(512);
+ const unsigned char *p = _keyData;
+ while( 1 ) {
+ unsigned bits = *p++;
+
+ switch( bits & 0x3f ) {
+ case cminkey: b.appendMinKey(""); break;
+ case cnull: b.appendNull(""); break;
+ case cfalse: b.appendBool("", false); break;
+ case ctrue: b.appendBool("", true); break;
+ case cmaxkey: b.appendMaxKey(""); break;
+ case cstring:
+ {
+ unsigned sz = *p++;
+ // we build the element ourself as we have to null terminate it
+ BufBuilder &bb = b.bb();
+ bb.appendNum((char) String);
+ bb.appendUChar(0); // fieldname ""
+ bb.appendNum(sz+1);
+ bb.appendBuf(p, sz);
+ bb.appendUChar(0); // null char at end of string
+ p += sz;
+ break;
+ }
+ case coid:
+ b.appendOID("", (OID *) p);
+ p += sizeof(OID);
+ break;
+ case cbindata:
+ {
+ int len = binDataCodeToLength(*p);
+ int subtype = (*p) & BinDataTypeMask;
+ if( subtype & 0x8 ) {
+ subtype = (subtype & 0x7) | 0x80;
+ }
+ b.appendBinData("", len, (BinDataType) subtype, ++p);
+ p += len;
+ break;
+ }
+ case cdate:
+ b.appendDate("", (Date_t&) *p);
+ p += 8;
+ break;
+ case cdouble:
+ b.append("", (double&) *p);
+ p += sizeof(double);
+ break;
+ case cint:
+ b.append("", (int) ((double&) *p));
+ p += sizeof(double);
+ break;
+ case clong:
+ b.append("", (long long) ((double&) *p));
+ p += sizeof(double);
+ break;
+ default:
+ assert(false);
+ }
+
+ if( (bits & cHASMORE) == 0 )
+ break;
+ }
+ return b.obj();
+ }
+
+ static int compare(const unsigned char *&l, const unsigned char *&r) {
+ int lt = (*l & cCANONTYPEMASK);
+ int rt = (*r & cCANONTYPEMASK);
+ int x = lt - rt;
+ if( x )
+ return x;
+
+ l++; r++;
+
+ // same type
+ switch( lt ) {
+ case cdouble:
+ {
+ double L = *((double *) l);
+ double R = *((double *) r);
+ if( L < R )
+ return -1;
+ if( L != R )
+ return 1;
+ l += 8; r += 8;
+ break;
+ }
+ case cstring:
+ {
+ int lsz = *l;
+ int rsz = *r;
+ int common = min(lsz, rsz);
+ l++; r++; // skip the size byte
+ // use memcmp as we (will) allow zeros in UTF8 strings
+ int res = memcmp(l, r, common);
+ if( res )
+ return res;
+ // longer string is the greater one
+ int diff = lsz-rsz;
+ if( diff )
+ return diff;
+ l += lsz; r += lsz;
+ break;
+ }
+ case cbindata:
+ {
+ int L = *l;
+ int R = *r;
+ int llen = binDataCodeToLength(L);
+ int diff = L-R; // checks length and subtype simultaneously
+ if( diff ) {
+ // unfortunately the nibbles are ordered the wrong way round to check
+ // subtype and length in one comparison (we could bit-swap them...)
+ int rlen = binDataCodeToLength(R);
+ if( llen != rlen )
+ return llen - rlen;
+ return diff;
+ }
+ // same length, same type
+ l++; r++;
+ int res = memcmp(l, r, llen);
+ if( res )
+ return res;
+ l += llen; r += llen;
+ break;
+ }
+ case cdate:
+ {
+ long long L = *((long long *) l);
+ long long R = *((long long *) r);
+ if( L < R )
+ return -1;
+ if( L > R )
+ return 1;
+ l += 8; r += 8;
+ break;
+ }
+ case coid:
+ {
+ int res = memcmp(l, r, sizeof(OID));
+ if( res )
+ return res;
+ l += 12; r += 12;
+ break;
+ }
+ default:
+ // all the others are a match -- e.g. null == null
+ ;
+ }
+
+ return 0;
+ }
+
+ // at least one of this and right are traditional BSON format
+ int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const {
+ BSONObj L = toBson();
+ BSONObj R = right.toBson();
+ return L.woCompare(R, order, /*considerfieldname*/false);
+ }
+
+ int KeyV1::woCompare(const KeyV1& right, const Ordering &order) const {
+ const unsigned char *l = _keyData;
+ const unsigned char *r = right._keyData;
+
+ if( (*l|*r) == IsBSON ) // only can do this if cNOTUSED maintained
+ return compareHybrid(right, order);
+
+ unsigned mask = 1;
+ while( 1 ) {
+ char lval = *l;
+ char rval = *r;
+ {
+ int x = compare(l, r); // updates l and r pointers
+ if( x ) {
+ if( order.descending(mask) )
+ x = -x;
+ return x;
+ }
+ }
+
+ {
+ int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE));
+ if( x )
+ return x;
+ if( (lval & cHASMORE) == 0 )
+ break;
+ }
+
+ mask <<= 1;
+ }
+
+ return 0;
+ }
+
+ static unsigned sizes[] = {
+ 0,
+ 1, //cminkey=1,
+ 1, //cnull=2,
+ 0,
+ 9, //cdouble=4,
+ 0,
+ 0, //cstring=6,
+ 0,
+ 13, //coid=8,
+ 0,
+ 1, //cfalse=10,
+ 1, //ctrue=11,
+ 9, //cdate=12,
+ 0,
+ 1, //cmaxkey=14,
+ 0
+ };
+
+ inline unsigned sizeOfElement(const unsigned char *p) {
+ unsigned type = *p & cCANONTYPEMASK;
+ unsigned sz = sizes[type];
+ if( sz == 0 ) {
+ if( type == cstring ) {
+ sz = ((unsigned) p[1]) + 2;
+ }
+ else {
+ assert( type == cbindata );
+ sz = binDataCodeToLength(p[1]) + 2;
+ }
+ }
+ return sz;
+ }
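+
+ // e.g. the compact string "ab" occupies 1 type byte + 1 length byte + 2 chars
+ // == 4 bytes, matching the cstring branch's ((unsigned) p[1]) + 2.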
+
+ int KeyV1::dataSize() const {
+ const unsigned char *p = _keyData;
+ if( !isCompactFormat() ) {
+ return bson().objsize() + 1;
+ }
+
+ bool more;
+ do {
+ unsigned z = sizeOfElement(p);
+ more = (*p & cHASMORE) != 0;
+ p += z;
+ } while( more );
+ return p - _keyData;
+ }
+
+ bool KeyV1::woEqual(const KeyV1& right) const {
+ const unsigned char *l = _keyData;
+ const unsigned char *r = right._keyData;
+
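+ // if either side is in traditional BSON format, compare as BSONObj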
+ if( (*l|*r) == IsBSON ) {
+ return toBson().equal(right.toBson());
+ }
+
+ while( 1 ) {
+ char lval = *l;
+ char rval = *r;
+ if( (lval&(cCANONTYPEMASK|cHASMORE)) != (rval&(cCANONTYPEMASK|cHASMORE)) )
+ return false;
+ l++; r++;
+ switch( lval&cCANONTYPEMASK ) {
+ case coid:
+ if( *((unsigned*) l) != *((unsigned*) r) )
+ return false;
+ l += 4; r += 4;
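+ // deliberate fall through: the cdate case below compares the
+ // remaining 8 of the OID's 12 bytes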
+ case cdate:
+ if( *((unsigned long long *) l) != *((unsigned long long *) r) )
+ return false;
+ l += 8; r += 8;
+ break;
+ case cdouble:
+ if( *((double *) l) != *((double *) r) )
+ return false;
+ l += 8; r += 8;
+ break;
+ case cstring:
+ {
+ if( *l != *r )
+ return false; // not same length
+ unsigned sz = ((unsigned) *l) + 1;
+ if( memcmp(l, r, sz) )
+ return false;
+ l += sz; r += sz;
+ break;
+ }
+ case cbindata:
+ {
+ if( *l != *r )
+ return false; // len or subtype mismatch
+ int len = binDataCodeToLength(*l) + 1;
+ if( memcmp(l, r, len) )
+ return false;
+ l += len; r += len;
+ break;
+ }
+ case cminkey:
+ case cnull:
+ case cfalse:
+ case ctrue:
+ case cmaxkey:
+ break;
+ default:
+ assert(false);
+ }
+ if( (lval&cHASMORE) == 0 )
+ break;
+ }
+ return true;
+ }
+
+ struct CmpUnitTest : public UnitTest {
+ void run() {
+ char a[2];
+ char b[2];
+ a[0] = -3;
+ a[1] = 0;
+ b[0] = 3;
+ b[1] = 0;
+ assert( strcmp(a,b)>0 && memcmp(a,b,2)>0 );
+ }
+ } cunittest;
+
+}
diff --git a/src/mongo/db/key.h b/src/mongo/db/key.h
new file mode 100644
index 00000000000..9284cdc7422
--- /dev/null
+++ b/src/mongo/db/key.h
@@ -0,0 +1,115 @@
+// @file key.h class(es) representing individual keys in a btree
+
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ /** Key class for precomputing a small format index key that is denser than a traditional BSONObj.
+
+ KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes.
+
+ KeyV1 is the new implementation.
+ */
+ class KeyBson /* "KeyV0" */ {
+ public:
+ KeyBson() { }
+ explicit KeyBson(const char *keyData) : _o(keyData) { }
+ explicit KeyBson(const BSONObj& obj) : _o(obj) { }
+ int woCompare(const KeyBson& r, const Ordering &o) const;
+ BSONObj toBson() const { return _o; }
+ string toString() const { return _o.toString(); }
+ int dataSize() const { return _o.objsize(); }
+ const char * data() const { return _o.objdata(); }
+ BSONElement _firstElement() const { return _o.firstElement(); }
+ bool isCompactFormat() const { return false; }
+ bool woEqual(const KeyBson& r) const;
+ void assign(const KeyBson& rhs) { *this = rhs; }
+ private:
+ BSONObj _o;
+ };
+
+ class KeyV1Owned;
+
+ // corresponding to BtreeData_V1
+ class KeyV1 {
+ void operator=(const KeyV1&); // disallowed just to make people be careful as we don't own the buffer
+ KeyV1(const KeyV1Owned&); // disallowed: the KeyV1Owned source would likely go out of scope, leaving a dangling buffer
+ public:
+ KeyV1() { _keyData = 0; }
+ ~KeyV1() { DEV _keyData = (const unsigned char *) 1; }
+
+ KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) {
+ dassert( _keyData > (const unsigned char *) 1 );
+ }
+
+ // explicit version of operator= to be safe
+ void assign(const KeyV1& rhs) {
+ _keyData = rhs._keyData;
+ }
+
+ /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format.
+ when BSON, we are just a wrapper
+ */
+ explicit KeyV1(const char *keyData) : _keyData((unsigned char *) keyData) { }
+
+ int woCompare(const KeyV1& r, const Ordering &o) const;
+ bool woEqual(const KeyV1& r) const;
+ BSONObj toBson() const;
+ string toString() const { return toBson().toString(); }
+
+ /** get the key data we want to store in the btree bucket */
+ const char * data() const { return (const char *) _keyData; }
+
+ /** @return size of data() */
+ int dataSize() const;
+
+ /** only used by geo, which always has bson keys */
+ BSONElement _firstElement() const { return bson().firstElement(); }
+ bool isCompactFormat() const { return *_keyData != IsBSON; }
+ protected:
+ enum { IsBSON = 0xff };
+ const unsigned char *_keyData;
+ BSONObj bson() const {
+ dassert( !isCompactFormat() );
+ return BSONObj((const char *) _keyData+1);
+ }
+ private:
+ int compareHybrid(const KeyV1& right, const Ordering& order) const;
+ };
+
+ class KeyV1Owned : public KeyV1 {
+ void operator=(const KeyV1Owned&);
+ public:
+ /** @param obj a BSON object to be translated to KeyV1 format. If the object isn't
+ representable in KeyV1 format (which happens, intentionally, at times)
+ it will stay as bson herein.
+ */
+ KeyV1Owned(const BSONObj& obj);
+
+ /** makes a copy (memcpy's the whole thing) */
+ KeyV1Owned(const KeyV1& rhs);
+
+ private:
+ StackBufBuilder b;
+ void traditional(const BSONObj& obj); // store as traditional bson not as compact format
+ };
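+
+ // Usage sketch (illustrative): build compact keys from BSON and compare them
+ // under an ordering.
+ // KeyV1Owned a( BSON( "" << 1 ) ), b( BSON( "" << 2 ) );
+ // int r = a.woCompare( b, Ordering::make( BSONObj() ) ); // r < 0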
+
+};
diff --git a/src/mongo/db/lasterror.cpp b/src/mongo/db/lasterror.cpp
new file mode 100644
index 00000000000..4ed4dfb0571
--- /dev/null
+++ b/src/mongo/db/lasterror.cpp
@@ -0,0 +1,142 @@
+// lasterror.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#include "../util/unittest.h"
+#include "../util/net/message.h"
+
+
+#include "lasterror.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ LastError LastError::noError;
+ LastErrorHolder lastError;
+
+ bool isShell = false;
+ void raiseError(int code , const char *msg) {
+ LastError *le = lastError.get();
+ if ( le == 0 ) {
+ /* might be intentional (non-user thread) */
+ DEV {
+ static unsigned n;
+ if( ++n < 4 && !isShell ) log() << "dev: lastError==0 won't report:" << msg << endl;
+ }
+ }
+ else if ( le->disabled ) {
+ log() << "lastError disabled, can't report: " << code << ":" << msg << endl;
+ }
+ else {
+ le->raiseError(code, msg);
+ }
+ }
+
+ bool LastError::appendSelf( BSONObjBuilder &b , bool blankErr ) {
+ if ( !valid ) {
+ if ( blankErr )
+ b.appendNull( "err" );
+ b.append( "n", 0 );
+ return false;
+ }
+
+ if ( msg.empty() ) {
+ if ( blankErr ) {
+ b.appendNull( "err" );
+ }
+ }
+ else {
+ b.append( "err", msg );
+ }
+
+ if ( code )
+ b.append( "code" , code );
+ if ( updatedExisting != NotUpdate )
+ b.appendBool( "updatedExisting", updatedExisting == True );
+ if ( upsertedId.isSet() )
+ b.append( "upserted" , upsertedId );
+ if ( writebackId.isSet() ) {
+ b.append( "writeback" , writebackId );
+ b.append( "instanceIdent" , prettyHostName() ); // this can be any unique string
+ }
+ b.appendNumber( "n", nObjects );
+
+ return ! msg.empty();
+ }
+
+ LastErrorHolder::~LastErrorHolder() {
+ }
+
+
+ LastError * LastErrorHolder::disableForCommand() {
+ LastError *le = _get();
+ uassert(13649, "no operation yet", le);
+ le->disabled = true;
+ le->nPrev--; // caller is a command that shouldn't count as an operation
+ return le;
+ }
+
+ LastError * LastErrorHolder::get( bool create ) {
+ LastError *ret = _get( create );
+ if ( ret && !ret->disabled )
+ return ret;
+ return 0;
+ }
+
+ LastError * LastErrorHolder::_get( bool create ) {
+ LastError * le = _tl.get();
+ if ( ! le && create ) {
+ le = new LastError();
+ _tl.reset( le );
+ }
+ return le;
+ }
+
+ void LastErrorHolder::release() {
+ _tl.release();
+ }
+
+ /** ok to call more than once. */
+ void LastErrorHolder::initThread() {
+ if( ! _tl.get() )
+ _tl.reset( new LastError() );
+ }
+
+ void LastErrorHolder::reset( LastError * le ) {
+ _tl.reset( le );
+ }
+
+ void prepareErrForNewRequest( Message &m, LastError * err ) {
+ // a killCursors message shouldn't affect last error
+ assert( err );
+ if ( m.operation() == dbKillCursors ) {
+ err->disabled = true;
+ }
+ else {
+ err->disabled = false;
+ err->nPrev++;
+ }
+ }
+
+ LastError * LastErrorHolder::startRequest( Message& m , LastError * le ) {
+ assert( le );
+ prepareErrForNewRequest( m, le );
+ return le;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/lasterror.h b/src/mongo/db/lasterror.h
new file mode 100644
index 00000000000..86250e496a8
--- /dev/null
+++ b/src/mongo/db/lasterror.h
@@ -0,0 +1,146 @@
+// lasterror.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../bson/oid.h"
+
+namespace mongo {
+ class BSONObjBuilder;
+ class Message;
+
+ struct LastError {
+ int code;
+ string msg;
+ enum UpdatedExistingType { NotUpdate, True, False } updatedExisting;
+ OID upsertedId;
+ OID writebackId;
+ long long nObjects;
+ int nPrev;
+ bool valid;
+ bool disabled;
+ void writeback( OID& oid ) {
+ reset( true );
+ writebackId = oid;
+ }
+ void raiseError(int _code , const char *_msg) {
+ reset( true );
+ code = _code;
+ msg = _msg;
+ }
+ void recordUpdate( bool _updateObjects , long long _nObjects , OID _upsertedId ) {
+ reset( true );
+ nObjects = _nObjects;
+ updatedExisting = _updateObjects ? True : False;
+ if ( _upsertedId.isSet() )
+ upsertedId = _upsertedId;
+
+ }
+ void recordDelete( long long nDeleted ) {
+ reset( true );
+ nObjects = nDeleted;
+ }
+ LastError() {
+ reset();
+ }
+ void reset( bool _valid = false ) {
+ code = 0;
+ msg.clear();
+ updatedExisting = NotUpdate;
+ nObjects = 0;
+ nPrev = 1;
+ valid = _valid;
+ disabled = false;
+ upsertedId.clear();
+ writebackId.clear();
+ }
+
+ /**
+ * @return if there is an err
+ */
+ bool appendSelf( BSONObjBuilder &b , bool blankErr = true );
+
+ struct Disabled : boost::noncopyable {
+ Disabled( LastError * le ) {
+ _le = le;
+ if ( _le ) {
+ _prev = _le->disabled;
+ _le->disabled = true;
+ }
+ else {
+ _prev = false;
+ }
+ }
+
+ ~Disabled() {
+ if ( _le )
+ _le->disabled = _prev;
+ }
+
+ LastError * _le;
+ bool _prev;
+ };
+
+ static LastError noError;
+ };
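+
+ // Typical flow (illustrative): a failing operation calls raiseError( code, msg )
+ // on the thread's LastError; a later getLastError command reports it via
+ // appendSelf(), producing something like { err: "...", code: ..., n: 0 }.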
+
+ extern class LastErrorHolder {
+ public:
+ LastErrorHolder(){}
+ ~LastErrorHolder();
+
+ LastError * get( bool create = false );
+ LastError * getSafe() {
+ LastError * le = get(false);
+ if ( ! le ) {
+ error() << " no LastError!" << endl;
+ assert( le );
+ }
+ return le;
+ }
+
+ LastError * _get( bool create = false ); // may return a disabled LastError
+
+ void reset( LastError * le );
+
+ /** ok to call more than once. */
+ void initThread();
+
+ int getID();
+
+ void release();
+
+ /** when db receives a message/request, call this */
+ LastError * startRequest( Message& m , LastError * connectionOwned );
+
+ void disconnect( int clientId );
+
+ // used to disable lastError reporting while processing a killCursors message
+ // disable causes get() to return 0.
+ LastError *disableForCommand(); // only call once per command invocation!
+ private:
+ boost::thread_specific_ptr<LastError> _tl;
+
+ struct Status {
+ time_t time;
+ LastError *lerr;
+ };
+ } lastError;
+
+ void raiseError(int code , const char *msg);
+
+} // namespace mongo
diff --git a/src/mongo/db/matcher.cpp b/src/mongo/db/matcher.cpp
new file mode 100755
index 00000000000..2631845a757
--- /dev/null
+++ b/src/mongo/db/matcher.cpp
@@ -0,0 +1,1128 @@
+// matcher.cpp
+
+/* Matcher is our boolean expression evaluator for "where" clauses */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "matcher.h"
+#include "../util/goodies.h"
+#include "../util/unittest.h"
+#include "diskloc.h"
+#include "../scripting/engine.h"
+#include "db.h"
+#include "queryutil.h"
+#include "client.h"
+
+#include "pdfile.h"
+
+namespace {
+ inline pcrecpp::RE_Options flags2options(const char* flags) {
+ pcrecpp::RE_Options options;
+ options.set_utf8(true);
+ while ( flags && *flags ) {
+ if ( *flags == 'i' )
+ options.set_caseless(true);
+ else if ( *flags == 'm' )
+ options.set_multiline(true);
+ else if ( *flags == 'x' )
+ options.set_extended(true);
+ else if ( *flags == 's' )
+ options.set_dotall(true);
+ flags++;
+ }
+ return options;
+ }
+}
+
+//#define DEBUGMATCHER(x) cout << x << endl;
+#define DEBUGMATCHER(x)
+
+namespace mongo {
+
+ extern BSONObj staticNull;
+
+ class Where {
+ public:
+ Where() {
+ jsScope = 0;
+ func = 0;
+ }
+ ~Where() {
+
+ if ( scope.get() ){
+ try {
+ scope->execSetup( "_mongo.readOnly = false;" , "make not read only" );
+ }
+ catch( DBException& e ){
+ warning() << "javascript scope cleanup interrupted" << causedBy( e ) << endl;
+ }
+ }
+
+ if ( jsScope ) {
+ delete jsScope;
+ jsScope = 0;
+ }
+ func = 0;
+ }
+
+ auto_ptr<Scope> scope;
+ ScriptingFunction func;
+ BSONObj *jsScope;
+
+ void setFunc(const char *code) {
+ massert( 10341 , "scope has to be created first!" , scope.get() );
+ func = scope->createFunction( code );
+ }
+
+ };
+
+ Matcher::~Matcher() {
+ delete _where;
+ _where = 0;
+ }
+
+ ElementMatcher::ElementMatcher( BSONElement e , int op, bool isNot )
+ : _toMatch( e ) , _compareOp( op ), _isNot( isNot ), _subMatcherOnPrimitives(false) {
+ if ( op == BSONObj::opMOD ) {
+ BSONObj o = e.embeddedObject();
+ _mod = o["0"].numberInt();
+ _modm = o["1"].numberInt();
+
+ uassert( 10073 , "mod can't be 0" , _mod );
+ }
+ else if ( op == BSONObj::opTYPE ) {
+ _type = (BSONType)(e.numberInt());
+ }
+ else if ( op == BSONObj::opELEM_MATCH ) {
+ BSONElement m = e;
+ uassert( 12517 , "$elemMatch needs an Object" , m.type() == Object );
+ BSONObj x = m.embeddedObject();
+ if ( x.firstElement().getGtLtOp() == 0 ) {
+ _subMatcher.reset( new Matcher( x ) );
+ _subMatcherOnPrimitives = false;
+ }
+ else {
+ // meant to act on primitives
+ _subMatcher.reset( new Matcher( BSON( "" << x ) ) );
+ _subMatcherOnPrimitives = true;
+ }
+ }
+ }
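+
+ // e.g. { a : { $mod : [ 4, 1 ] } } stores _mod == 4 and _modm == 1
+ // (match when the value % 4 == 1).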
+
+ ElementMatcher::ElementMatcher( BSONElement e , int op , const BSONObj& array, bool isNot )
+ : _toMatch( e ) , _compareOp( op ), _isNot( isNot ), _subMatcherOnPrimitives(false) {
+
+ _myset.reset( new set<BSONElement,element_lt>() );
+
+ BSONObjIterator i( array );
+ while ( i.more() ) {
+ BSONElement ie = i.next();
+ if ( op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
+ shared_ptr<Matcher> s;
+ s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) );
+ _allMatchers.push_back( s );
+ }
+ else if ( ie.type() == RegEx ) {
+ if ( !_myregex.get() ) {
+ _myregex.reset( new vector< RegexMatcher >() );
+ }
+ _myregex->push_back( RegexMatcher() );
+ RegexMatcher &rm = _myregex->back();
+ rm._re.reset( new pcrecpp::RE( ie.regex(), flags2options( ie.regexFlags() ) ) );
+ rm._fieldName = 0; // no need for field name
+ rm._regex = ie.regex();
+ rm._flags = ie.regexFlags();
+ rm._isNot = false;
+ bool purePrefix;
+ string prefix = simpleRegex(rm._regex, rm._flags, &purePrefix);
+ if (purePrefix)
+ rm._prefix = prefix;
+ }
+ else {
+ uassert( 15882, "$elemMatch not allowed within $in",
+ ie.type() != Object ||
+ ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH );
+ _myset->insert(ie);
+ }
+ }
+
+ if ( _allMatchers.size() ) {
+ uassert( 13020 , "with $all, can't mix $elemMatch and others" , _myset->size() == 0 && !_myregex.get());
+ }
+
+ }
+
+ int ElementMatcher::inverseOfNegativeCompareOp() const {
+ verify( 15892, negativeCompareOp() );
+ return _compareOp == BSONObj::NE ? BSONObj::Equality : BSONObj::opIN;
+ }
+
+ bool ElementMatcher::negativeCompareOpContainsNull() const {
+ verify( 15893, negativeCompareOp() );
+ return (_compareOp == BSONObj::NE && _toMatch.type() != jstNULL) ||
+ (_compareOp == BSONObj::NIN && _myset->count( staticNull.firstElement()) == 0 );
+ }
+
+ void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot) {
+
+ RegexMatcher rm;
+ rm._re.reset( new pcrecpp::RE(regex, flags2options(flags)) );
+ rm._fieldName = fieldName;
+ rm._regex = regex;
+ rm._flags = flags;
+ rm._isNot = isNot;
+
+        if (!isNot) { //TODO something smarter
+            bool purePrefix;
+            string prefix = simpleRegex(regex, flags, &purePrefix);
+            if (purePrefix)
+                rm._prefix = prefix;
+        }
+        // push only after _prefix is set, so the stored copy keeps the prefix
+        _regexs.push_back(rm);
+ }
+
+ bool Matcher::addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ) {
+ const char *fn = fe.fieldName();
+ int op = fe.getGtLtOp( -1 );
+ if ( op == -1 ) {
+ if ( !isNot && fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ) {
+ return false; // { $ref : xxx } - treat as normal object
+ }
+ uassert( 10068 , (string)"invalid operator: " + fn , op != -1 );
+ }
+
+ switch ( op ) {
+ case BSONObj::GT:
+ case BSONObj::GTE:
+ case BSONObj::LT:
+ case BSONObj::LTE: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), op, isNot);
+ break;
+ }
+ case BSONObj::NE: {
+ _haveNeg = true;
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::NE, isNot);
+ break;
+ }
+ case BSONObj::opALL:
+ _all = true;
+ case BSONObj::opIN: {
+ uassert( 13276 , "$in needs an array" , fe.isABSONObj() );
+ _basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ BSONObjIterator i( fe.embeddedObject() );
+ while( i.more() ) {
+ if ( i.next().type() == Array ) {
+ _hasArray = true;
+ }
+ }
+ break;
+ }
+ case BSONObj::NIN:
+ uassert( 13277 , "$nin needs an array" , fe.isABSONObj() );
+ _haveNeg = true;
+ _basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ break;
+ case BSONObj::opMOD:
+ case BSONObj::opTYPE:
+ case BSONObj::opELEM_MATCH: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ // these are types where ElementMatcher has all the info
+ _basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) );
+ break;
+ }
+ case BSONObj::opSIZE: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot);
+ _haveSize = true;
+ break;
+ }
+ case BSONObj::opEXISTS: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::opEXISTS, isNot);
+ break;
+ }
+ case BSONObj::opREGEX: {
+ uassert( 13032, "can't use $not with $regex, use BSON regex type instead", !isNot );
+ if ( fe.type() == RegEx ) {
+ regex = fe.regex();
+ flags = fe.regexFlags();
+ }
+ else {
+ regex = fe.valuestrsafe();
+ }
+ break;
+ }
+ case BSONObj::opOPTIONS: {
+ uassert( 13029, "can't use $not with $options, use BSON regex type instead", !isNot );
+ flags = fe.valuestrsafe();
+ break;
+ }
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ case BSONObj::opMAX_DISTANCE:
+ break;
+ default:
+            uassert( 10069 , (string)"BUG - can't handle operator: " + fn , 0 );
+ }
+ return true;
+ }
+
+ void Matcher::parseExtractedClause( const BSONElement &e, list< shared_ptr< Matcher > > &matchers ) {
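+        // e.g. for { $or : [ { a : 1 }, { b : 2 } ] }, e is the $or element and
+        // two sub-Matchers ( {a:1} and {b:2} ) are appended to 'matchers'.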
+ uassert( 13086, "$and/$or/$nor must be a nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ uassert( 13087, "$and/$or/$nor match element must be an object", f.type() == Object );
+ matchers.push_back( shared_ptr< Matcher >( new Matcher( f.embeddedObject(), true ) ) );
+ }
+ }
+
+ bool Matcher::parseClause( const BSONElement &e ) {
+ const char *ef = e.fieldName();
+
+ if ( ef[ 0 ] != '$' )
+ return false;
+
+ // $and
+ if ( ef[ 1 ] == 'a' && ef[ 2 ] == 'n' && ef[ 3 ] == 'd' ) {
+ parseExtractedClause( e, _andMatchers );
+ return true;
+ }
+
+ // $or
+ if ( ef[ 1 ] == 'o' && ef[ 2 ] == 'r' && ef[ 3 ] == 0 ) {
+ parseExtractedClause( e, _orMatchers );
+ return true;
+ }
+
+ // $nor
+ if ( ef[ 1 ] == 'n' && ef[ 2 ] == 'o' && ef[ 3 ] == 'r' && ef[ 4 ] == 0 ) {
+ parseExtractedClause( e, _norMatchers );
+ return true;
+ }
+
+ // $comment
+ if ( ef[ 1 ] == 'c' && ef[ 2 ] == 'o' && ef[ 3 ] == 'm' && str::equals( ef , "$comment" ) ) {
+ return true;
+ }
+
+ return false;
+ }
+
+ // $where: function()...
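+    // e.g. { $where : "this.a > this.b" } -- the code string (or Code /
+    // CodeWScope element) is compiled once here and invoked per document
+    // from matches().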
+ NOINLINE_DECL void Matcher::parseWhere( const BSONElement &e ) {
+ uassert(15902 , "$where expression has an unexpected type", e.type() == String || e.type() == CodeWScope || e.type() == Code );
+ uassert( 10066 , "$where may only appear once in query", _where == 0 );
+ uassert( 10067 , "$where query, but no script engine", globalScriptEngine );
+        massert( 13089 , "need a current client for $where" , haveClient() );
+ _where = new Where();
+ _where->scope = globalScriptEngine->getPooledScope( cc().ns() );
+ _where->scope->localConnect( cc().database()->name.c_str() );
+
+ if ( e.type() == CodeWScope ) {
+ _where->setFunc( e.codeWScopeCode() );
+ _where->jsScope = new BSONObj( e.codeWScopeScopeData() );
+ }
+ else {
+ const char *code = e.valuestr();
+ _where->setFunc(code);
+ }
+
+ _where->scope->execSetup( "_mongo.readOnly = true;" , "make read only" );
+ }
+
+ void Matcher::parseMatchExpressionElement( const BSONElement &e, bool nested ) {
+
+ uassert( 13629 , "can't have undefined in a query expression" , e.type() != Undefined );
+
+ if ( parseClause( e ) ) {
+ return;
+ }
+
+ const char *fn = e.fieldName();
+ if ( str::equals(fn, "$where") ) {
+ parseWhere(e);
+ return;
+ }
+
+ if ( e.type() == RegEx ) {
+ addRegex( fn, e.regex(), e.regexFlags() );
+ return;
+ }
+
+ // greater than / less than...
+ // e.g., e == { a : { $gt : 3 } }
+ // or
+ // { a : { $in : [1,2,3] } }
+ if ( e.type() == Object ) {
+ // support {$regex:"a|b", $options:"imx"}
+ const char* regex = NULL;
+ const char* flags = "";
+
+ // e.g., fe == { $gt : 3 }
+ BSONObjIterator j(e.embeddedObject());
+ bool isOperator = false;
+ while ( j.more() ) {
+ BSONElement fe = j.next();
+ const char *fn = fe.fieldName();
+
+ if ( fn[0] == '$' && fn[1] ) {
+ isOperator = true;
+
+ if ( fn[1] == 'n' && fn[2] == 'o' && fn[3] == 't' && fn[4] == 0 ) {
+ _haveNeg = true;
+ switch( fe.type() ) {
+ case Object: {
+ BSONObjIterator k( fe.embeddedObject() );
+ uassert( 13030, "$not cannot be empty", k.more() );
+ while( k.more() ) {
+ addOp( e, k.next(), true, regex, flags );
+ }
+ break;
+ }
+ case RegEx:
+ addRegex( e.fieldName(), fe.regex(), fe.regexFlags(), true );
+ break;
+ default:
+ uassert( 13031, "invalid use of $not", false );
+ }
+ }
+ else {
+ if ( !addOp( e, fe, false, regex, flags ) ) {
+ isOperator = false;
+ break;
+ }
+ }
+ }
+ else {
+ isOperator = false;
+ break;
+ }
+ }
+ if (regex) {
+ addRegex(e.fieldName(), regex, flags);
+ }
+ if ( isOperator )
+ return;
+ }
+
+ if ( e.type() == Array ) {
+ _hasArray = true;
+ }
+ else if( *fn == '$' ) {
+ if( str::equals(fn, "$atomic") || str::equals(fn, "$isolated") ) {
+ uassert( 14844, "$atomic specifier must be a top level field", !nested );
+ _atomic = e.trueValue();
+ return;
+ }
+ }
+
+ // normal, simple case e.g. { a : "foo" }
+ addBasic(e, BSONObj::Equality, false);
+ }
+
+ /* _jsobj - the query pattern
+ */
+ Matcher::Matcher(const BSONObj &jsobj, bool nested) :
+ _where(0), _jsobj(jsobj), _haveSize(), _all(), _hasArray(0), _haveNeg(), _atomic(false) {
+
+ BSONObjIterator i(_jsobj);
+ while ( i.more() ) {
+ parseMatchExpressionElement( i.next(), nested );
+ }
+ }
+
+ Matcher::Matcher( const Matcher &docMatcher, const BSONObj &key ) :
+ _where(0), _constrainIndexKey( key ), _haveSize(), _all(), _hasArray(0), _haveNeg(), _atomic(false) {
+ // Filter out match components that will provide an incorrect result
+ // given a key from a single key index.
+ for( vector< ElementMatcher >::const_iterator i = docMatcher._basics.begin(); i != docMatcher._basics.end(); ++i ) {
+ if ( key.hasField( i->_toMatch.fieldName() ) ) {
+ switch( i->_compareOp ) {
+ case BSONObj::opSIZE:
+ case BSONObj::opALL:
+ case BSONObj::NE:
+ case BSONObj::NIN:
+ case BSONObj::opEXISTS: // We can't match on index in this case.
+ case BSONObj::opTYPE: // For $type:10 (null), a null key could be a missing field or a null value field.
+ break;
+ case BSONObj::opIN: {
+ bool inContainsArray = false;
+ for( set<BSONElement,element_lt>::const_iterator j = i->_myset->begin(); j != i->_myset->end(); ++j ) {
+ if ( j->type() == Array ) {
+ inContainsArray = true;
+ break;
+ }
+ }
+ // Can't match an array to its first indexed element.
+ if ( !i->_isNot && !inContainsArray ) {
+ _basics.push_back( *i );
+ }
+ break;
+ }
+ default: {
+ // Can't match an array to its first indexed element.
+ if ( !i->_isNot && i->_toMatch.type() != Array ) {
+ _basics.push_back( *i );
+ }
+ }
+ }
+ }
+ }
+ for( vector<RegexMatcher>::const_iterator it = docMatcher._regexs.begin();
+ it != docMatcher._regexs.end();
+ ++it) {
+ if ( !it->_isNot && key.hasField( it->_fieldName ) ) {
+ _regexs.push_back(*it);
+ }
+ }
+ // Recursively filter match components for and and or matchers.
+ for( list< shared_ptr< Matcher > >::const_iterator i = docMatcher._andMatchers.begin(); i != docMatcher._andMatchers.end(); ++i ) {
+ _andMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) );
+ }
+ for( list< shared_ptr< Matcher > >::const_iterator i = docMatcher._orMatchers.begin(); i != docMatcher._orMatchers.end(); ++i ) {
+ _orMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) );
+ }
+ }
+
+ inline bool regexMatches(const RegexMatcher& rm, const BSONElement& e) {
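+        // Fast path: when the pattern reduced to a pure prefix (e.g. /^abc/),
+        // a strncmp against _prefix replaces the full PCRE PartialMatch.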
+ switch (e.type()) {
+ case String:
+ case Symbol:
+ if (rm._prefix.empty())
+ return rm._re->PartialMatch(e.valuestr());
+ else
+ return !strncmp(e.valuestr(), rm._prefix.c_str(), rm._prefix.size());
+ case RegEx:
+ return !strcmp(rm._regex, e.regex()) && !strcmp(rm._flags, e.regexFlags());
+ default:
+ return false;
+ }
+ }
+
+ inline int Matcher::valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) const {
+ assert( op != BSONObj::NE && op != BSONObj::NIN );
+
+ if ( op == BSONObj::Equality ) {
+ return l.valuesEqual(r);
+ }
+
+ if ( op == BSONObj::opIN ) {
+ // { $in : [1,2,3] }
+ int count = bm._myset->count(l);
+ if ( count )
+ return count;
+ if ( bm._myregex.get() ) {
+ for( vector<RegexMatcher>::const_iterator i = bm._myregex->begin(); i != bm._myregex->end(); ++i ) {
+ if ( regexMatches( *i, l ) ) {
+ return true;
+ }
+ }
+ }
+ }
+
+ if ( op == BSONObj::opSIZE ) {
+ if ( l.type() != Array )
+ return 0;
+ int count = 0;
+ BSONObjIterator i( l.embeddedObject() );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ ++count;
+ }
+ return count == r.number();
+ }
+
+ if ( op == BSONObj::opMOD ) {
+ if ( ! l.isNumber() )
+ return false;
+
+ return l.numberLong() % bm._mod == bm._modm;
+ }
+
+ if ( op == BSONObj::opTYPE ) {
+ return bm._type == l.type();
+ }
+
+ /* check LT, GTE, ... */
+ if ( l.canonicalType() != r.canonicalType() )
+ return false;
+ int c = compareElementValues(l, r);
+ if ( c < -1 ) c = -1;
+ if ( c > 1 ) c = 1;
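+        // Map the clamped result c in {-1,0,1} to a bit in {1,2,4}; the
+        // LT/LTE/GT/GTE op codes are bitmasks over these "less/equal/greater"
+        // bits, so the AND below answers the comparison directly.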
+ int z = 1 << (c+1);
+ return (op & z);
+ }
+
+ int Matcher::inverseMatch(const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher& bm , MatchDetails * details ) const {
+ int inverseRet = matchesDotted( fieldName, toMatch, obj, bm.inverseOfNegativeCompareOp(), bm , false , details );
+ if ( bm.negativeCompareOpContainsNull() ) {
+ return ( inverseRet <= 0 ) ? 1 : 0;
+ }
+ return -inverseRet;
+ }
+
+ int retExistsFound( const ElementMatcher &bm ) {
+ return bm._toMatch.trueValue() ? 1 : -1;
+ }
+
+ /* Check if a particular field matches.
+
+ fieldName - field to match "a.b" if we are reaching into an embedded object.
+ toMatch - element we want to match.
+ obj - database object to check against
+ compareOp - Equality, LT, GT, etc. This may be different than, and should supersede, the compare op in em.
+       isArr - true if we recursed into an array while walking the field path
+
+ Special forms:
+
+ { "a.b" : 3 } means obj.a.b == 3
+ { a : { $lt : 3 } } means obj.a < 3
+ { a : { $in : [1,2] } } means [1,2].contains(obj.a)
+
+ return value
+ -1 mismatch
+ 0 missing element
+ 1 match
+ */
+ int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& em , bool isArr, MatchDetails * details ) const {
+ DEBUGMATCHER( "\t matchesDotted : " << fieldName << " hasDetails: " << ( details ? "yes" : "no" ) );
+
+ if ( compareOp == BSONObj::opALL ) {
+
+ if ( em._allMatchers.size() ) {
+ // $all query matching will not be performed against indexes, so the field
+ // to match is always extracted from the full document.
+ BSONElement e = obj.getFieldDotted( fieldName );
+ // The $all/$elemMatch operator only matches arrays.
+ if ( e.type() != Array ) {
+ return -1;
+ }
+
+ for ( unsigned i=0; i<em._allMatchers.size(); i++ ) {
+ bool found = false;
+ BSONObjIterator x( e.embeddedObject() );
+ while ( x.more() ) {
+ BSONElement f = x.next();
+
+ if ( f.type() != Object )
+ continue;
+ if ( em._allMatchers[i]->matches( f.embeddedObject() ) ) {
+ found = true;
+ break;
+ }
+ }
+
+ if ( ! found )
+ return -1;
+ }
+
+ return 1;
+ }
+
+ if ( em._myset->size() == 0 && !em._myregex.get() )
+ return -1; // is this desired?
+
+ BSONElementSet myValues;
+ obj.getFieldsDotted( fieldName , myValues );
+
+ for( set< BSONElement, element_lt >::const_iterator i = em._myset->begin(); i != em._myset->end(); ++i ) {
+ // ignore nulls
+ if ( i->type() == jstNULL )
+ continue;
+
+ if ( myValues.count( *i ) == 0 )
+ return -1;
+ }
+
+ if ( !em._myregex.get() )
+ return 1;
+
+ for( vector< RegexMatcher >::const_iterator i = em._myregex->begin(); i != em._myregex->end(); ++i ) {
+ bool match = false;
+ for( BSONElementSet::const_iterator j = myValues.begin(); j != myValues.end(); ++j ) {
+ if ( regexMatches( *i, *j ) ) {
+ match = true;
+ break;
+ }
+ }
+ if ( !match )
+ return -1;
+ }
+
+ return 1;
+ } // end opALL
+
+ if ( compareOp == BSONObj::NE || compareOp == BSONObj::NIN ) {
+ return inverseMatch( fieldName, toMatch, obj, em , details );
+ }
+
+ BSONElement e;
+ bool indexed = !_constrainIndexKey.isEmpty();
+ if ( indexed ) {
+ e = obj.getFieldUsingIndexNames(fieldName, _constrainIndexKey);
+ if( e.eoo() ) {
+ cout << "obj: " << obj << endl;
+ cout << "fieldName: " << fieldName << endl;
+ cout << "_constrainIndexKey: " << _constrainIndexKey << endl;
+ assert( !e.eoo() );
+ }
+ }
+ else {
+
+ const char *p = strchr(fieldName, '.');
+ if ( p ) {
+ string left(fieldName, p-fieldName);
+
+ BSONElement se = obj.getField(left.c_str());
+            if ( !se.eoo() && ( se.type() == Object || se.type() == Array ) ) {
+                BSONObj eo = se.embeddedObject();
+                return matchesDotted(p+1, toMatch, eo, compareOp, em, se.type() == Array , details );
+            }
+ }
+
+ // An array was encountered while scanning for components of the field name.
+ if ( isArr ) {
+ DEBUGMATCHER( "\t\t isArr 1 : obj : " << obj );
+ BSONObjIterator ai(obj);
+ bool found = false;
+ while ( ai.moreWithEOO() ) {
+ BSONElement z = ai.next();
+
+ if( strcmp(z.fieldName(),fieldName) == 0 ) {
+ if ( compareOp == BSONObj::opEXISTS ) {
+ return retExistsFound( em );
+ }
+ if (valuesMatch(z, toMatch, compareOp, em) ) {
+ // "field.<n>" array notation was used
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+
+ if ( z.type() == Object ) {
+ BSONObj eo = z.embeddedObject();
+ int cmp = matchesDotted(fieldName, toMatch, eo, compareOp, em, false, details );
+ if ( cmp > 0 ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ else if ( cmp < 0 ) {
+ found = true;
+ }
+ }
+ }
+ return found ? -1 : 0;
+ }
+
+ if( p ) {
+ // Left portion of field name was not found or wrong type.
+ return 0;
+ }
+ else {
+ e = obj.getField(fieldName);
+ }
+ }
+
+ if ( compareOp == BSONObj::opEXISTS ) {
+ if( e.eoo() ) {
+ return 0;
+ } else {
+ return retExistsFound( em );
+ }
+ }
+ else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) &&
+ valuesMatch(e, toMatch, compareOp, em ) ) {
+ return 1;
+ }
+ else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) {
+ BSONObjIterator ai(e.embeddedObject());
+
+ while ( ai.moreWithEOO() ) {
+ BSONElement z = ai.next();
+
+ if ( compareOp == BSONObj::opELEM_MATCH ) {
+ if ( z.type() == Object ) {
+ if ( em._subMatcher->matches( z.embeddedObject() ) ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+ else if ( em._subMatcherOnPrimitives ) {
+ if ( z.type() && em._subMatcher->matches( z.wrap( "" ) ) ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+ }
+ else {
+ if ( valuesMatch( z, toMatch, compareOp, em) ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+
+ }
+
+ // match an entire array to itself
+ if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ) {
+ return 1;
+ }
+ if ( compareOp == BSONObj::opIN && valuesMatch( e, toMatch, compareOp, em ) ) {
+ return 1;
+ }
+ }
+ else if ( e.eoo() ) {
+ return 0;
+ }
+ return -1;
+ }
+
+ extern int dump;
+
+ /* See if an object matches the query.
+ */
+ bool Matcher::matches(const BSONObj& jsobj , MatchDetails * details ) const {
+ LOG(5) << "Matcher::matches() " << jsobj.toString() << endl;
+
+ /* assuming there is usually only one thing to match. if more this
+ could be slow sometimes. */
+
+ // check normal non-regex cases:
+ for ( unsigned i = 0; i < _basics.size(); i++ ) {
+ const ElementMatcher& bm = _basics[i];
+ const BSONElement& m = bm._toMatch;
+ // -1=mismatch. 0=missing element. 1=match
+ int cmp = matchesDotted(m.fieldName(), m, jsobj, bm._compareOp, bm , false , details );
+ if ( cmp == 0 && bm._compareOp == BSONObj::opEXISTS ) {
+ // If missing, match cmp is opposite of $exists spec.
+ cmp = -retExistsFound(bm);
+ }
+ if ( bm._isNot )
+ cmp = -cmp;
+ if ( cmp < 0 )
+ return false;
+ if ( cmp == 0 ) {
+ /* missing is ok iff we were looking for null */
+ if ( m.type() == jstNULL || m.type() == Undefined ||
+ ( ( bm._compareOp == BSONObj::opIN || bm._compareOp == BSONObj::NIN ) && bm._myset->count( staticNull.firstElement() ) > 0 ) ) {
+ if ( bm.negativeCompareOp() ^ bm._isNot ) {
+ return false;
+ }
+ }
+ else {
+ if ( !bm._isNot ) {
+ return false;
+ }
+ }
+ }
+ }
+
+ for (vector<RegexMatcher>::const_iterator it = _regexs.begin();
+ it != _regexs.end();
+ ++it) {
+ BSONElementSet s;
+ if ( !_constrainIndexKey.isEmpty() ) {
+ BSONElement e = jsobj.getFieldUsingIndexNames(it->_fieldName, _constrainIndexKey);
+
+ // Should only have keys nested one deep here, for geo-indices
+ // TODO: future indices may nest deeper?
+ if( e.type() == Array ){
+ BSONObjIterator i( e.Obj() );
+ while( i.more() ){
+ s.insert( i.next() );
+ }
+ }
+ else if ( !e.eoo() )
+ s.insert( e );
+
+ }
+ else {
+ jsobj.getFieldsDotted( it->_fieldName, s );
+ }
+ bool match = false;
+ for( BSONElementSet::const_iterator i = s.begin(); i != s.end(); ++i )
+ if ( regexMatches(*it, *i) )
+ match = true;
+ if ( !match ^ it->_isNot )
+ return false;
+ }
+
+ if ( _orDedupConstraints.size() > 0 ) {
+ for( vector< shared_ptr< FieldRangeVector > >::const_iterator i = _orDedupConstraints.begin();
+ i != _orDedupConstraints.end(); ++i ) {
+ if ( (*i)->matches( jsobj ) ) {
+ return false;
+ }
+ }
+ }
+
+ if ( _andMatchers.size() > 0 ) {
+ for( list< shared_ptr< Matcher > >::const_iterator i = _andMatchers.begin();
+ i != _andMatchers.end(); ++i ) {
+ // SERVER-3192 Track field matched using details the same as for
+ // top level fields, at least for now.
+ if ( !(*i)->matches( jsobj, details ) ) {
+ return false;
+ }
+ }
+ }
+
+ if ( _orMatchers.size() > 0 ) {
+ bool match = false;
+ for( list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin();
+ i != _orMatchers.end(); ++i ) {
+ // SERVER-205 don't submit details - we don't want to track field
+ // matched within $or
+ if ( (*i)->matches( jsobj ) ) {
+ match = true;
+ break;
+ }
+ }
+ if ( !match ) {
+ return false;
+ }
+ }
+
+ if ( _norMatchers.size() > 0 ) {
+ for( list< shared_ptr< Matcher > >::const_iterator i = _norMatchers.begin();
+ i != _norMatchers.end(); ++i ) {
+ // SERVER-205 don't submit details - we don't want to track field
+ // matched within $nor
+ if ( (*i)->matches( jsobj ) ) {
+ return false;
+ }
+ }
+ }
+
+ if ( _where ) {
+ if ( _where->func == 0 ) {
+ uassert( 10070 , "$where compile error", false);
+ return false; // didn't compile
+ }
+
+ if ( _where->jsScope ) {
+ _where->scope->init( _where->jsScope );
+ }
+ _where->scope->setObject( "obj", const_cast< BSONObj & >( jsobj ) );
+ _where->scope->setBoolean( "fullObject" , true ); // this is a hack b/c fullObject used to be relevant
+
+ int err = _where->scope->invoke( _where->func , 0, &jsobj , 1000 * 60 , false );
+ if ( err == -3 ) { // INVOKE_ERROR
+ stringstream ss;
+ ss << "error on invocation of $where function:\n"
+ << _where->scope->getError();
+ uassert( 10071 , ss.str(), false);
+ return false;
+ }
+ else if ( err != 0 ) { // ! INVOKE_SUCCESS
+ uassert( 10072 , "unknown error in invocation of $where function", false);
+ return false;
+ }
+ return _where->scope->getBoolean( "return" ) != 0;
+
+ }
+
+ return true;
+ }
+
+ bool Matcher::keyMatch( const Matcher &docMatcher ) const {
+ // Quick check certain non key match cases.
+ if ( docMatcher._all
+ || docMatcher._haveSize
+ || docMatcher._hasArray // We can't match an array to its first indexed element using keymatch
+ || docMatcher._haveNeg ) {
+ return false;
+ }
+
+ // Check that all match components are available in the index matcher.
+ if ( !( _basics.size() == docMatcher._basics.size() && _regexs.size() == docMatcher._regexs.size() && !docMatcher._where ) ) {
+ return false;
+ }
+ if ( _andMatchers.size() != docMatcher._andMatchers.size() ) {
+ return false;
+ }
+ if ( _orMatchers.size() != docMatcher._orMatchers.size() ) {
+ return false;
+ }
+ if ( docMatcher._norMatchers.size() > 0 ) {
+ return false;
+ }
+ if ( docMatcher._orDedupConstraints.size() > 0 ) {
+ return false;
+ }
+
+ // Recursively check that all submatchers support key match.
+ {
+ list< shared_ptr< Matcher > >::const_iterator i = _andMatchers.begin();
+ list< shared_ptr< Matcher > >::const_iterator j = docMatcher._andMatchers.begin();
+ while( i != _andMatchers.end() ) {
+ if ( !(*i)->keyMatch( **j ) ) {
+ return false;
+ }
+ ++i; ++j;
+ }
+ }
+ {
+ list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin();
+ list< shared_ptr< Matcher > >::const_iterator j = docMatcher._orMatchers.begin();
+ while( i != _orMatchers.end() ) {
+ if ( !(*i)->keyMatch( **j ) ) {
+ return false;
+ }
+ ++i; ++j;
+ }
+ }
+ // Nor matchers and or dedup constraints aren't created for index matchers,
+ // so no need to check those here.
+ return true;
+ }
+
+
+    /* -- just for testing -- */
+#pragma pack(1)
+ struct JSObj1 {
+ JSObj1() {
+ totsize=sizeof(JSObj1);
+ n = NumberDouble;
+ strcpy_s(nname, 5, "abcd");
+ N = 3.1;
+ s = String;
+ strcpy_s(sname, 7, "abcdef");
+ slen = 10;
+ strcpy_s(sval, 10, "123456789");
+ eoo = EOO;
+ }
+ unsigned totsize;
+
+ char n;
+ char nname[5];
+ double N;
+
+ char s;
+ char sname[7];
+ unsigned slen;
+ char sval[10];
+
+ char eoo;
+ };
+#pragma pack()
+
+ struct JSObj1 js1;
+
+#pragma pack(1)
+ struct JSObj2 {
+ JSObj2() {
+ totsize=sizeof(JSObj2);
+ s = String;
+ strcpy_s(sname, 7, "abcdef");
+ slen = 10;
+ strcpy_s(sval, 10, "123456789");
+ eoo = EOO;
+ }
+ unsigned totsize;
+ char s;
+ char sname[7];
+ unsigned slen;
+ char sval[10];
+ char eoo;
+ } js2;
+
+ struct JSUnitTest : public UnitTest {
+ void run() {
+
+ BSONObj j1((const char *) &js1);
+ BSONObj j2((const char *) &js2);
+ Matcher m(j2);
+ assert( m.matches(j1) );
+ js2.sval[0] = 'z';
+ assert( !m.matches(j1) );
+ Matcher n(j1);
+ assert( n.matches(j1) );
+ assert( !n.matches(j2) );
+
+ BSONObj j0 = BSONObj();
+// BSONObj j0((const char *) &js0);
+ Matcher p(j0);
+ assert( p.matches(j1) );
+ assert( p.matches(j2) );
+ }
+ } jsunittest;
+
+#pragma pack()
+
+ struct RXTest : public UnitTest {
+
+ RXTest() {
+ }
+
+ void run() {
+ /*
+ static const boost::regex e("(\\d{4}[- ]){3}\\d{4}");
+ static const boost::regex b(".....");
+ out() << "regex result: " << regex_match("hello", e) << endl;
+ out() << "regex result: " << regex_match("abcoo", b) << endl;
+ */
+
+ int ret = 0;
+
+ pcre_config( PCRE_CONFIG_UTF8 , &ret );
+ massert( 10342 , "pcre not compiled with utf8 support" , ret );
+
+ pcrecpp::RE re1(")({a}h.*o");
+ pcrecpp::RE re("h.llo");
+ assert( re.FullMatch("hello") );
+ assert( !re1.FullMatch("hello") );
+
+
+ pcrecpp::RE_Options options;
+ options.set_utf8(true);
+ pcrecpp::RE part("dwi", options);
+ assert( part.PartialMatch("dwight") );
+
+ pcre_config( PCRE_CONFIG_UNICODE_PROPERTIES , &ret );
+ if ( ! ret )
+ cout << "warning: some regex utf8 things will not work. pcre build doesn't have --enable-unicode-properties" << endl;
+
+ }
+ } rxtest;
+
+} // namespace mongo
diff --git a/src/mongo/db/matcher.h b/src/mongo/db/matcher.h
new file mode 100644
index 00000000000..b6994a79229
--- /dev/null
+++ b/src/mongo/db/matcher.h
@@ -0,0 +1,276 @@
+// matcher.h
+
+/* Matcher is our boolean expression evaluator for "where" clauses */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+#include "pcrecpp.h"
+
+namespace mongo {
+
+ class Cursor;
+ class CoveredIndexMatcher;
+ class Matcher;
+ class FieldRangeVector;
+
+ class RegexMatcher {
+ public:
+ const char *_fieldName;
+ const char *_regex;
+ const char *_flags;
+ string _prefix;
+ shared_ptr< pcrecpp::RE > _re;
+ bool _isNot;
+ RegexMatcher() : _isNot() {}
+ };
+
+ struct element_lt {
+ bool operator()(const BSONElement& l, const BSONElement& r) const {
+ int x = (int) l.canonicalType() - (int) r.canonicalType();
+ if ( x < 0 ) return true;
+ else if ( x > 0 ) return false;
+ return compareElementValues(l,r) < 0;
+ }
+ };
+
+
+ class ElementMatcher {
+ public:
+
+ ElementMatcher() {
+ }
+
+ ElementMatcher( BSONElement e , int op, bool isNot );
+
+ ElementMatcher( BSONElement e , int op , const BSONObj& array, bool isNot );
+
+ ~ElementMatcher() { }
+
+ bool negativeCompareOp() const { return _compareOp == BSONObj::NE || _compareOp == BSONObj::NIN; }
+ int inverseOfNegativeCompareOp() const;
+ bool negativeCompareOpContainsNull() const;
+
+ BSONElement _toMatch;
+ int _compareOp;
+ bool _isNot;
+ shared_ptr< set<BSONElement,element_lt> > _myset;
+ shared_ptr< vector<RegexMatcher> > _myregex;
+
+ // these are for specific operators
+ int _mod;
+ int _modm;
+ BSONType _type;
+
+ shared_ptr<Matcher> _subMatcher;
+        bool _subMatcherOnPrimitives;
+
+ vector< shared_ptr<Matcher> > _allMatchers;
+ };
+
+ class Where; // used for $where javascript eval
+ class DiskLoc;
+
+ struct MatchDetails {
+ MatchDetails() {
+ reset();
+ }
+
+ void reset() {
+ _loadedObject = false;
+ _elemMatchKey = 0;
+ }
+
+ string toString() const {
+ stringstream ss;
+ ss << "loadedObject: " << _loadedObject << " ";
+ ss << "elemMatchKey: " << ( _elemMatchKey ? _elemMatchKey : "NULL" ) << " ";
+ return ss.str();
+ }
+
+ bool _loadedObject;
+ const char * _elemMatchKey; // warning, this may go out of scope if matched object does
+ };
+
+ /* Match BSON objects against a query pattern.
+
+ e.g.
+ db.foo.find( { a : 3 } );
+
+ { a : 3 } is the pattern object. See wiki documentation for full info.
+
+ GT/LT:
+ { a : { $gt : 3 } }
+ Not equal:
+ { a : { $ne : 3 } }
+
+ TODO: we should rewrite the matcher to be more an AST style.
+ */
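+    /* Typical use (illustrative):
+
+           Matcher m( BSON( "a" << BSON( "$gt" << 3 ) ) );
+           bool ok = m.matches( BSON( "a" << 5 ) );   // true
+    */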
+ class Matcher : boost::noncopyable {
+ int matchesDotted(
+ const char *fieldName,
+ const BSONElement& toMatch, const BSONObj& obj,
+ int compareOp, const ElementMatcher& bm, bool isArr , MatchDetails * details ) const;
+
+ /**
+ * Perform a NE or NIN match by returning the inverse of the opposite matching operation.
+ * Missing values are considered matches unless the match must not equal null.
+ */
+ int inverseMatch(
+ const char *fieldName,
+ const BSONElement &toMatch, const BSONObj &obj,
+ const ElementMatcher&bm, MatchDetails * details ) const;
+
+ public:
+ static int opDirection(int op) {
+ return op <= BSONObj::LTE ? -1 : 1;
+ }
+
+ Matcher(const BSONObj &pattern, bool nested=false);
+
+ ~Matcher();
+
+ bool matches(const BSONObj& j, MatchDetails * details = 0 ) const;
+
+ bool atomic() const { return _atomic; }
+
+ string toString() const {
+ return _jsobj.toString();
+ }
+
+ void addOrDedupConstraint( const shared_ptr< FieldRangeVector > &frv ) {
+ _orDedupConstraints.push_back( frv );
+ }
+
+ void popOrClause() {
+ _orMatchers.pop_front();
+ }
+
+ /**
+ * @return true if this key matcher will return the same true/false
+ * value as the provided doc matcher.
+ */
+ bool keyMatch( const Matcher &docMatcher ) const;
+
+ bool singleSimpleCriterion() const {
+ return false; // TODO SERVER-958
+// // TODO Really check, especially if all basics are ok.
+// // $all, etc
+// // _orConstraints?
+// return ( ( basics.size() + nRegex ) < 2 ) && !where && !_orMatchers.size() && !_norMatchers.size();
+ }
+
+        const BSONObj *getQuery() const { return &_jsobj; }
+
+ private:
+ /**
+ * Generate a matcher for the provided index key format using the
+ * provided full doc matcher.
+ */
+ Matcher( const Matcher &docMatcher, const BSONObj &constrainIndexKey );
+
+ void addBasic(const BSONElement &e, int c, bool isNot) {
+ // TODO May want to selectively ignore these element types based on op type.
+ if ( e.type() == MinKey || e.type() == MaxKey )
+ return;
+ _basics.push_back( ElementMatcher( e , c, isNot ) );
+ }
+
+ void addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot = false);
+ bool addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags );
+
+ int valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) const;
+
+ bool parseClause( const BSONElement &e );
+ void parseExtractedClause( const BSONElement &e, list< shared_ptr< Matcher > > &matchers );
+
+ void parseWhere( const BSONElement &e );
+ void parseMatchExpressionElement( const BSONElement &e, bool nested );
+
+ Where *_where; // set if query uses $where
+ BSONObj _jsobj; // the query pattern. e.g., { name: "joe" }
+ BSONObj _constrainIndexKey;
+ vector<ElementMatcher> _basics;
+ bool _haveSize;
+ bool _all;
+ bool _hasArray;
+ bool _haveNeg;
+
+ /* $atomic - if true, a multi document operation (some removes, updates)
+ should be done atomically. in that case, we do not yield -
+ i.e. we stay locked the whole time.
+       http://www.mongodb.org/display/DOCS/Removing
+ */
+ bool _atomic;
+
+ vector<RegexMatcher> _regexs;
+
+ // so we delete the mem when we're done:
+ vector< shared_ptr< BSONObjBuilder > > _builders;
+ list< shared_ptr< Matcher > > _andMatchers;
+ list< shared_ptr< Matcher > > _orMatchers;
+ list< shared_ptr< Matcher > > _norMatchers;
+ vector< shared_ptr< FieldRangeVector > > _orDedupConstraints;
+
+ friend class CoveredIndexMatcher;
+ };
+
+ // If match succeeds on index key, then attempt to match full document.
+ class CoveredIndexMatcher : boost::noncopyable {
+ public:
+ CoveredIndexMatcher(const BSONObj &pattern, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false );
+ bool matches(const BSONObj &o) { return _docMatcher->matches( o ); }
+ bool matchesWithSingleKeyIndex(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 ) {
+ return matches( key, recLoc, details, true );
+ }
+ /**
+ * This is the preferred method for matching against a cursor, as it
+ * can handle both multi and single key cursors.
+ */
+ bool matchesCurrent( Cursor * cursor , MatchDetails * details = 0 );
+ bool needRecord() { return _needRecord; }
+
+ Matcher& docMatcher() { return *_docMatcher; }
+
+ // once this is called, shouldn't use this matcher for matching any more
+ void advanceOrClause( const shared_ptr< FieldRangeVector > &frv ) {
+ _docMatcher->addOrDedupConstraint( frv );
+ // TODO this is not yet optimal. Since we could skip an entire
+ // or clause (if a match is impossible) between calls to advanceOrClause()
+ // we may not pop all the clauses we can.
+ _docMatcher->popOrClause();
+ }
+
+ CoveredIndexMatcher *nextClauseMatcher( const BSONObj &indexKeyPattern, bool alwaysUseRecord=false ) {
+ return new CoveredIndexMatcher( _docMatcher, indexKeyPattern, alwaysUseRecord );
+ }
+
+ string toString() const;
+
+ private:
+ bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 , bool keyUsable = true );
+ CoveredIndexMatcher(const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false );
+ void init( bool alwaysUseRecord );
+ shared_ptr< Matcher > _docMatcher;
+ Matcher _keyMatcher;
+
+ bool _needRecord; // if the key itself isn't good enough to determine a positive match
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/matcher_covered.cpp b/src/mongo/db/matcher_covered.cpp
new file mode 100644
index 00000000000..c6c89d03007
--- /dev/null
+++ b/src/mongo/db/matcher_covered.cpp
@@ -0,0 +1,101 @@
+// matcher_covered.cpp
+
+/* CoveredIndexMatcher matches against the index key first, loading the full document only when necessary */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "matcher.h"
+#include "../util/goodies.h"
+#include "../util/unittest.h"
+#include "diskloc.h"
+#include "../scripting/engine.h"
+#include "db.h"
+#include "client.h"
+
+#include "pdfile.h"
+
+namespace mongo {
+
+ CoveredIndexMatcher::CoveredIndexMatcher( const BSONObj &jsobj, const BSONObj &indexKeyPattern, bool alwaysUseRecord) :
+ _docMatcher( new Matcher( jsobj ) ),
+ _keyMatcher( *_docMatcher, indexKeyPattern ) {
+ init( alwaysUseRecord );
+ }
+
+ CoveredIndexMatcher::CoveredIndexMatcher( const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord ) :
+ _docMatcher( docMatcher ),
+ _keyMatcher( *_docMatcher, indexKeyPattern ) {
+ init( alwaysUseRecord );
+ }
+
+ void CoveredIndexMatcher::init( bool alwaysUseRecord ) {
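+        // _needRecord == false means the index key alone can decide a match --
+        // e.g. { a : 1 } against index { a : 1 } normally needs no fetch,
+        // while { a : { $size : 2 } } always loads the document.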
+ _needRecord =
+ alwaysUseRecord ||
+ !_keyMatcher.keyMatch( *_docMatcher );
+ }
+
+ bool CoveredIndexMatcher::matchesCurrent( Cursor * cursor , MatchDetails * details ) {
+        // bool keyUsable = ! cursor->isMultiKey() && check for $orish like conditions in matcher SERVER-1264
+        return matches( cursor->currKey() , cursor->currLoc() , details ,
+                        !cursor->indexKeyPattern().isEmpty() // key unusable for an unindexed cursor...
+                        && !cursor->isMultiKey() // ...or a multikey cursor
+                        );
+ }
+
+ bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details , bool keyUsable ) {
+
+ LOG(5) << "CoveredIndexMatcher::matches() " << key.toString() << ' ' << recLoc.toString() << ' ' << keyUsable << endl;
+
+ dassert( key.isValid() );
+
+ if ( details )
+ details->reset();
+
+ if ( keyUsable ) {
+ if ( !_keyMatcher.matches(key, details ) ) {
+ return false;
+ }
+ if ( ! _needRecord ) {
+ return true;
+ }
+ }
+
+ if ( details )
+ details->_loadedObject = true;
+
+ bool res = _docMatcher->matches(recLoc.obj() , details );
+ LOG(5) << "CoveredIndexMatcher _docMatcher->matches() returns " << res << endl;
+ return res;
+ }
+
+ string CoveredIndexMatcher::toString() const {
+ StringBuilder buf;
+ buf << "(CoveredIndexMatcher ";
+
+ if ( _needRecord )
+ buf << "needRecord ";
+
+ buf << "keyMatcher: " << _keyMatcher.toString() << " ";
+
+ if ( _docMatcher )
+ buf << "docMatcher: " << _docMatcher->toString() << " ";
+
+ buf << ")";
+ return buf.str();
+ }
+}
diff --git a/src/mongo/db/minilex.h b/src/mongo/db/minilex.h
new file mode 100644
index 00000000000..677514aa47c
--- /dev/null
+++ b/src/mongo/db/minilex.h
@@ -0,0 +1,164 @@
+// minilex.h
+// mini js lexical analyzer. idea is to be dumb and fast.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#error does anything use this?
+
+namespace mongo {
+
+#if defined(_WIN32)
+
+} // namespace mongo
+
+#include <hash_map>
+using namespace stdext;
+
+namespace mongo {
+
+ typedef const char * MyStr;
+ struct less_str {
+ bool operator()(const MyStr & x, const MyStr & y) const {
+ if ( strcmp(x, y) > 0)
+ return true;
+
+ return false;
+ }
+ };
+
+ typedef hash_map<const char*, int, hash_compare<const char *, less_str> > strhashmap;
+
+#else
+
+} // namespace mongo
+
+#include <ext/hash_map>
+
+namespace mongo {
+
+ using namespace __gnu_cxx;
+
+ typedef const char * MyStr;
+ struct eq_str {
+ bool operator()(const MyStr & x, const MyStr & y) const {
+ if ( strcmp(x, y) == 0)
+ return true;
+
+ return false;
+ }
+ };
+
+ typedef hash_map<const char*, int, hash<const char *>, eq_str > strhashmap;
+
+#endif
+
+ /*
+ struct MiniLexNotUsed {
+ strhashmap reserved;
+ bool ic[256]; // ic=Identifier Character
+ bool starter[256];
+
+    // dm: very dumb about comments and escaped quotes -- but we are faster that way, at least,
+    // albeit returning too much (which is ok for jsobj's current usage).
+    void grabVariables(char *code , strhashmap& vars) { // 'code' modified and must stay in scope
+ char *p = code;
+ char last = 0;
+ while ( *p ) {
+ if ( starter[*p] ) {
+ char *q = p+1;
+ while ( *q && ic[*q] ) q++;
+ const char *identifier = p;
+ bool done = *q == 0;
+ *q = 0;
+ if ( !reserved.count(identifier) ) {
+ // we try to be smart about 'obj' but have to be careful as obj.obj
+ // can happen; this is so that nFields is right for simplistic where cases
+ // so we can stop scanning in jsobj when we find the field of interest.
+ if ( strcmp(identifier,"obj")==0 && p>code && p[-1] != '.' )
+ ;
+ else
+ vars[identifier] = 1;
+ }
+ if ( done )
+ break;
+ p = q + 1;
+ continue;
+ }
+
+ if ( *p == '\'' ) {
+ p++;
+ while ( *p && *p != '\'' ) p++;
+ }
+ else if ( *p == '"' ) {
+ p++;
+ while ( *p && *p != '"' ) p++;
+ }
+ p++;
+ }
+}
+
+MiniLex() {
+ strhashmap atest;
+ atest["foo"] = 3;
+ assert( atest.count("bar") == 0 );
+ assert( atest.count("foo") == 1 );
+ assert( atest["foo"] == 3 );
+
+ for ( int i = 0; i < 256; i++ ) {
+ ic[i] = starter[i] = false;
+ }
+ for ( int i = 'a'; i <= 'z'; i++ )
+ ic[i] = starter[i] = true;
+ for ( int i = 'A'; i <= 'Z'; i++ )
+ ic[i] = starter[i] = true;
+ for ( int i = '0'; i <= '9'; i++ )
+ ic[i] = true;
+ for ( int i = 128; i < 256; i++ )
+ ic[i] = starter[i] = true;
+ ic['$'] = starter['$'] = true;
+ ic['_'] = starter['_'] = true;
+
+ reserved["break"] = true;
+ reserved["case"] = true;
+ reserved["catch"] = true;
+ reserved["continue"] = true;
+ reserved["default"] = true;
+ reserved["delete"] = true;
+ reserved["do"] = true;
+ reserved["else"] = true;
+ reserved["finally"] = true;
+ reserved["for"] = true;
+ reserved["function"] = true;
+ reserved["if"] = true;
+ reserved["in"] = true;
+ reserved["instanceof"] = true;
+ reserved["new"] = true;
+ reserved["return"] = true;
+ reserved["switch"] = true;
+ reserved["this"] = true;
+ reserved["throw"] = true;
+ reserved["try"] = true;
+ reserved["typeof"] = true;
+ reserved["var"] = true;
+ reserved["void"] = true;
+ reserved["while"] = true;
+ reserved["with "] = true;
+}
+};
+*/
+
+} // namespace mongo
diff --git a/src/mongo/db/module.cpp b/src/mongo/db/module.cpp
new file mode 100644
index 00000000000..4269c5e99a0
--- /dev/null
+++ b/src/mongo/db/module.cpp
@@ -0,0 +1,68 @@
+// module.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "module.h"
+
+namespace mongo {
+
+ std::list<Module*> * Module::_all;
+
+ Module::Module( const string& name )
+ : _name( name ) , _options( (string)"Module " + name + " options" ) {
+ if ( ! _all )
+ _all = new list<Module*>();
+ _all->push_back( this );
+ }
+
+ Module::~Module() {}
+
+ void Module::addOptions( boost::program_options::options_description& options ) {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ options.add( m->_options );
+ }
+ }
+
+ void Module::configAll( boost::program_options::variables_map& params ) {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ m->config( params );
+ }
+
+ }
+
+
+ void Module::initAll() {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ m->init();
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/module.h b/src/mongo/db/module.h
new file mode 100644
index 00000000000..71f276e0585
--- /dev/null
+++ b/src/mongo/db/module.h
@@ -0,0 +1,70 @@
+// module.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include <boost/program_options.hpp>
+#include <list>
+
+namespace mongo {
+
+ /**
+     * Module is the base class for adding modules to MongoDB.
+     * Modules allow adding hooks and features to mongo;
+     * the idea is to add hooks into the main code where module support is needed.
+     * Some examples: monitoring, indexes, full text search.
+ */
+ class Module {
+ public:
+ Module( const string& name );
+ virtual ~Module();
+
+ boost::program_options::options_description_easy_init add_options() {
+ return _options.add_options();
+ }
+
+ /**
+ * read config from command line
+ */
+ virtual void config( boost::program_options::variables_map& params ) = 0;
+
+ /**
+     * called after configuration when the server is ready to start
+ */
+ virtual void init() = 0;
+
+ /**
+ * called when the database is about to shutdown
+ */
+ virtual void shutdown() = 0;
+
+ const string& getName() { return _name; }
+
+ // --- static things
+
+ static void addOptions( boost::program_options::options_description& options );
+ static void configAll( boost::program_options::variables_map& params );
+ static void initAll();
+
+ private:
+ static std::list<Module*> * _all;
+ string _name;
+ boost::program_options::options_description _options;
+ };
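+
+    /* Minimal usage sketch (hypothetical "StatsModule", illustrative only):
+
+           class StatsModule : public Module {
+           public:
+               StatsModule() : Module( "stats" ) {
+                   add_options()
+                       ( "stats-interval" , boost::program_options::value<int>() , "reporting interval" );
+               }
+               virtual void config( boost::program_options::variables_map& params ) { }
+               virtual void init() { }
+               virtual void shutdown() { }
+           };
+           static StatsModule statsModule; // construction registers it via Module::_all
+    */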
+}
diff --git a/src/mongo/db/modules/mms.cpp b/src/mongo/db/modules/mms.cpp
new file mode 100644
index 00000000000..418a553f283
--- /dev/null
+++ b/src/mongo/db/modules/mms.cpp
@@ -0,0 +1,170 @@
+// @file mms.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "../db.h"
+#include "../instance.h"
+#include "../module.h"
+#include "../../util/net/httpclient.h"
+#include "../../util/background.h"
+#include "../commands.h"
+
+namespace po = boost::program_options;
+
+namespace mongo {
+
+ /** Mongo Monitoring Service
+        if enabled, this runs in the background and pings MMS
+ */
+ class MMS : public BackgroundJob , Module {
+ public:
+
+ MMS()
+ : Module( "mms" ) , _baseurl( "" ) ,
+ _secsToSleep(1) , _token( "" ) , _name( "" ) {
+
+ add_options()
+ ( "mms-url" , po::value<string>()->default_value("http://mms.10gen.com/ping") , "url for mongo monitoring server" )
+ ( "mms-token" , po::value<string>() , "account token for mongo monitoring server" )
+ ( "mms-name" , po::value<string>() , "server name for mongo monitoring server" )
+ ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval (in seconds) for mongo monitoring server" )
+ ;
+ }
+
+ ~MMS() {}
+
+ void config( boost::program_options::variables_map& params ) {
+ _baseurl = params["mms-url"].as<string>();
+ if ( params.count( "mms-token" ) ) {
+ _token = params["mms-token"].as<string>();
+ }
+ if ( params.count( "mms-name" ) ) {
+ _name = params["mms-name"].as<string>();
+ }
+ _secsToSleep = params["mms-interval"].as<int>();
+ }
+
+ void run() {
+ if ( _token.size() == 0 && _name.size() == 0 ) {
+ log(1) << "mms not configured" << endl;
+ return;
+ }
+
+ if ( _token.size() == 0 ) {
+ log() << "no token for mms - not running" << endl;
+ return;
+ }
+
+ if ( _name.size() == 0 ) {
+ log() << "no name for mms - not running" << endl;
+ return;
+ }
+
+ log() << "mms monitor staring... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl;
+ Client::initThread( "mms" );
+ Client& c = cc();
+
+
+ // TODO: using direct client is bad, but easy for now
+
+ while ( ! inShutdown() ) {
+ sleepsecs( _secsToSleep );
+
+ try {
+ stringstream url;
+ url << _baseurl << "?"
+ << "token=" << _token << "&"
+ << "name=" << _name << "&"
+ << "ts=" << time(0)
+ ;
+
+ BSONObjBuilder bb;
+ // duplicated so the post has everything
+ bb.append( "token" , _token );
+ bb.append( "name" , _name );
+ bb.appendDate( "ts" , jsTime() );
+
+ // any commands
+ _add( bb , "buildinfo" );
+ _add( bb , "serverStatus" );
+
+ BSONObj postData = bb.obj();
+
+ log(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;;
+
+                    HttpClient http;
+                    HttpClient::Result r;
+                    int rc = http.post( url.str() , postData.jsonString() , &r );
+ log(1) << "\t response code: " << rc << endl;
+ if ( rc != 200 ) {
+ log() << "mms error response code:" << rc << endl;
+ log(1) << "mms error body:" << r.getEntireResponse() << endl;
+ }
+ }
+ catch ( std::exception& e ) {
+ log() << "mms exception: " << e.what() << endl;
+ }
+ }
+
+ c.shutdown();
+ }
+
+ void _add( BSONObjBuilder& postData , const char* cmd ) {
+ Command * c = Command::findCommand( cmd );
+ if ( ! c ) {
+ log() << "MMS can't find command: " << cmd << endl;
+ postData.append( cmd , "can't find command" );
+ return;
+ }
+
+ if ( c->locktype() ) {
+ log() << "MMS can only use noLocking commands not: " << cmd << endl;
+ postData.append( cmd , "not noLocking" );
+ return;
+ }
+
+ BSONObj co = BSON( cmd << 1 );
+
+ string errmsg;
+ BSONObjBuilder sub;
+ if ( ! c->run( "admin.$cmd" , co , 0 , errmsg , sub , false ) )
+ postData.append( cmd , errmsg );
+ else
+ postData.append( cmd , sub.obj() );
+ }
+
+
+ void init() { go(); }
+
+ void shutdown() {
+ // TODO
+ }
+
+ private:
+ string _baseurl;
+ int _secsToSleep;
+
+ string _token;
+ string _name;
+
+ } /*mms*/ ;
+
+}
+
+
+
diff --git a/src/mongo/db/mongo.ico b/src/mongo/db/mongo.ico
new file mode 100755
index 00000000000..5258b6e0446
--- /dev/null
+++ b/src/mongo/db/mongo.ico
Binary files differ
diff --git a/src/mongo/db/mongommf.cpp b/src/mongo/db/mongommf.cpp
new file mode 100644
index 00000000000..af2e822404e
--- /dev/null
+++ b/src/mongo/db/mongommf.cpp
@@ -0,0 +1,339 @@
+// @file mongommf.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* this module adds some of our layers atop memory mapped files - specifically our handling of private views & such
+ if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class, not this.
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "mongommf.h"
+#include "dur.h"
+#include "dur_journalformat.h"
+#include "../util/mongoutils/str.h"
+#include "mongomutex.h"
+#include "d_globals.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+#if defined(_WIN32)
+ extern mutex mapViewMutex;
+
+ __declspec(noinline) void makeChunkWritable(size_t chunkno) {
+ scoped_lock lk(mapViewMutex);
+
+ if( writable.get(chunkno) ) // double check lock
+ return;
+
+ // remap all maps in this chunk. common case is a single map, but could have more than one with smallfiles or .ns files
+ size_t chunkStart = chunkno * MemoryMappedFile::ChunkSize;
+ size_t chunkNext = chunkStart + MemoryMappedFile::ChunkSize;
+
+ scoped_lock lk2(privateViews._mutex());
+ map<void*,MongoMMF*>::iterator i = privateViews.finditer_inlock((void*) (chunkNext-1));
+ while( 1 ) {
+ const pair<void*,MongoMMF*> x = *(--i);
+ MongoMMF *mmf = x.second;
+ if( mmf == 0 )
+ break;
+
+ size_t viewStart = (size_t) x.first;
+ size_t viewEnd = (size_t) (viewStart + mmf->length());
+ if( viewEnd <= chunkStart )
+ break;
+
+ size_t protectStart = max(viewStart, chunkStart);
+ dassert(protectStart<chunkNext);
+
+ size_t protectEnd = min(viewEnd, chunkNext);
+ size_t protectSize = protectEnd - protectStart;
+ dassert(protectSize>0&&protectSize<=MemoryMappedFile::ChunkSize);
+
+ DWORD old;
+ bool ok = VirtualProtect((void*)protectStart, protectSize, PAGE_WRITECOPY, &old);
+ if( !ok ) {
+ DWORD e = GetLastError();
+ log() << "VirtualProtect failed (mcw) " << mmf->filename() << ' ' << chunkno << hex << protectStart << ' ' << protectSize << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+ }
+
+ writable.set(chunkno);
+ }
+
+ void* MemoryMappedFile::createPrivateMap() {
+ assert( maphandle );
+ scoped_lock lk(mapViewMutex);
+ void *p = MapViewOfFile(maphandle, FILE_MAP_READ, 0, 0, 0);
+ if ( p == 0 ) {
+ DWORD e = GetLastError();
+ log() << "createPrivateMap failed " << filename() << " " <<
+ errnoWithDescription(e) << " filelen:" << len <<
+ ((sizeof(void*) == 4 ) ? " (32 bit build)" : "") <<
+ endl;
+ }
+ else {
+ clearWritableBits(p);
+ views.push_back(p);
+ }
+ return p;
+ }
+
+ void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) {
+ d.dbMutex.assertWriteLocked(); // short window where we are unmapped so must be exclusive
+
+ // the mapViewMutex is to assure we get the same address on the remap
+ scoped_lock lk(mapViewMutex);
+
+ clearWritableBits(oldPrivateAddr);
+#if 1
+ // https://jira.mongodb.org/browse/SERVER-2942
+ DWORD old;
+ bool ok = VirtualProtect(oldPrivateAddr, (SIZE_T) len, PAGE_READONLY, &old);
+ if( !ok ) {
+ DWORD e = GetLastError();
+ log() << "VirtualProtect failed in remapPrivateView " << filename() << hex << oldPrivateAddr << ' ' << len << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+ return oldPrivateAddr;
+#else
+ if( !UnmapViewOfFile(oldPrivateAddr) ) {
+ DWORD e = GetLastError();
+ log() << "UnMapViewOfFile failed " << filename() << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+
+ // we want the new address to be the same as the old address in case things keep pointers around (as namespaceindex does).
+ void *p = MapViewOfFileEx(maphandle, FILE_MAP_READ, 0, 0,
+ /*dwNumberOfBytesToMap 0 means to eof*/0 /*len*/,
+ oldPrivateAddr);
+
+ if ( p == 0 ) {
+ DWORD e = GetLastError();
+ log() << "MapViewOfFileEx failed " << filename() << " " << errnoWithDescription(e) << endl;
+ assert(p);
+ }
+ assert(p == oldPrivateAddr);
+ return p;
+#endif
+ }
+#endif
+
+ void MongoMMF::remapThePrivateView() {
+ assert( cmdLine.dur );
+
+ // todo 1.9 : it turns out we require that we always remap to the same address.
+ // so the remove / add isn't necessary and can be removed
+ privateViews.remove(_view_private);
+ _view_private = remapPrivateView(_view_private);
+ privateViews.add(_view_private, this);
+ }
+
+ /** register view. threadsafe */
+ void PointerToMMF::add(void *view, MongoMMF *f) {
+ assert(view);
+ assert(f);
+ mutex::scoped_lock lk(_m);
+ _views.insert( pair<void*,MongoMMF*>(view,f) );
+ }
+
+ /** de-register view. threadsafe */
+ void PointerToMMF::remove(void *view) {
+ if( view ) {
+ mutex::scoped_lock lk(_m);
+ _views.erase(view);
+ }
+ }
+
+ PointerToMMF::PointerToMMF() : _m("PointerToMMF") {
+#if defined(SIZE_MAX)
+ size_t max = SIZE_MAX;
+#else
+ size_t max = ~((size_t)0);
+#endif
+ assert( max > (size_t) this ); // just checking that no one redef'd SIZE_MAX and that it is sane
+
+ // this way we don't need any boundary checking in _find()
+ _views.insert( pair<void*,MongoMMF*>((void*)0,(MongoMMF*)0) );
+ _views.insert( pair<void*,MongoMMF*>((void*)max,(MongoMMF*)0) );
+ }
+
+ /** underscore version of find is for when you are already locked
+ @param ofs out return our offset in the view
+ @return the MongoMMF to which this pointer belongs
+ */
+ MongoMMF* PointerToMMF::find_inlock(void *p, /*out*/ size_t& ofs) {
+ //
+ // .................memory..........................
+ // v1 p v2
+ // [--------------------] [-------]
+ //
+ // e.g., _find(p) == v1
+ //
+ const pair<void*,MongoMMF*> x = *(--_views.upper_bound(p));
+ MongoMMF *mmf = x.second;
+ if( mmf ) {
+ size_t o = ((char *)p) - ((char*)x.first);
+ if( o < mmf->length() ) {
+ ofs = o;
+ return mmf;
+ }
+ }
+ return 0;
+ }
+
+ /** find associated MMF object for a given pointer.
+ threadsafe
+ @param ofs out returns offset into the view of the pointer, if found.
+ @return the MongoMMF to which this pointer belongs. null if not found.
+ */
+ MongoMMF* PointerToMMF::find(void *p, /*out*/ size_t& ofs) {
+ mutex::scoped_lock lk(_m);
+ return find_inlock(p, ofs);
+ }
+
+ PointerToMMF privateViews;
+
+ /* void* MongoMMF::switchToPrivateView(void *readonly_ptr) {
+ assert( cmdLine.dur );
+ assert( testIntent );
+
+ void *p = readonly_ptr;
+
+ {
+ size_t ofs=0;
+ MongoMMF *mmf = ourReadViews.find(p, ofs);
+ if( mmf ) {
+ void *res = ((char *)mmf->_view_private) + ofs;
+ return res;
+ }
+ }
+
+ {
+ size_t ofs=0;
+ MongoMMF *mmf = privateViews.find(p, ofs);
+ if( mmf ) {
+ log() << "dur: perf warning p=" << p << " is already in the writable view of " << mmf->filename() << endl;
+ return p;
+ }
+ }
+
+ // did you call writing() with a pointer that isn't into a datafile?
+ log() << "dur error switchToPrivateView " << p << endl;
+ return p;
+ }*/
+
+    /* switch to _view_write. normally this is a bad idea, since changes made there
+       will not show up in _view_private if that view has already diverged; hence the
+       leading underscore as a warning. useful when done with some care, though, such
+       as during initialization.
+    */
+ void* MongoMMF::_switchToWritableView(void *p) {
+ size_t ofs;
+ MongoMMF *f = privateViews.find(p, ofs);
+ assert( f );
+ return (((char *)f->_view_write)+ofs);
+ }
+
+ extern string dbpath;
+
+ // here so that it is precomputed...
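+    // e.g. "a/b/foo.3" splits into prefix "a/b/foo" and suffix "3" (_fileSuffixNo 3);
+    // a ".ns" suffix maps to the special dur::JEntry::DotNsSuffix value.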
+ void MongoMMF::setPath(string f) {
+ string suffix;
+ string prefix;
+ bool ok = str::rSplitOn(f, '.', prefix, suffix);
+ uassert(13520, str::stream() << "MongoMMF only supports filenames in a certain format " << f, ok);
+ if( suffix == "ns" )
+ _fileSuffixNo = dur::JEntry::DotNsSuffix;
+ else
+ _fileSuffixNo = (int) str::toUnsigned(suffix);
+
+ _p = RelativePath::fromFullPath(prefix);
+ }
+
+ bool MongoMMF::open(string fname, bool sequentialHint) {
+ LOG(3) << "mmf open " << fname << endl;
+ setPath(fname);
+ _view_write = mapWithOptions(fname.c_str(), sequentialHint ? SEQUENTIAL : 0);
+ return finishOpening();
+ }
+
+ bool MongoMMF::create(string fname, unsigned long long& len, bool sequentialHint) {
+ LOG(3) << "mmf create " << fname << endl;
+ setPath(fname);
+ _view_write = map(fname.c_str(), len, sequentialHint ? SEQUENTIAL : 0);
+ return finishOpening();
+ }
+
+ bool MongoMMF::finishOpening() {
+ LOG(3) << "mmf finishOpening " << (void*) _view_write << ' ' << filename() << " len:" << length() << endl;
+ if( _view_write ) {
+ if( cmdLine.dur ) {
+ _view_private = createPrivateMap();
+ if( _view_private == 0 ) {
+ msgasserted(13636, str::stream() << "file " << filename() << " open/create failed in createPrivateMap (look in log for more information)");
+ }
+ privateViews.add(_view_private, this); // note that testIntent builds use this, even though it points to view_write then...
+ }
+ else {
+ _view_private = _view_write;
+ }
+ return true;
+ }
+ return false;
+ }
+
+ MongoMMF::MongoMMF() : _willNeedRemap(false) {
+ _view_write = _view_private = 0;
+ }
+
+ MongoMMF::~MongoMMF() {
+ try {
+ close();
+ }
+ catch(...) { error() << "exception in ~MongoMMF" << endl; }
+ }
+
+ namespace dur {
+ void closingFileNotification();
+ }
+
+ /*virtual*/ void MongoMMF::close() {
+ LOG(3) << "mmf close " << filename() << endl;
+
+ if( view_write() /*actually was opened*/ ) {
+ if( cmdLine.dur ) {
+ dur::closingFileNotification();
+ }
+ if( !d.dbMutex.isWriteLocked() ) {
+ assert( inShutdown() );
+ DEV {
+ log() << "is it really ok to close a mongommf outside a write lock? dbmutex status:" << d.dbMutex.getState() << " file:" << filename() << endl;
+ }
+ }
+ }
+
+ LockMongoFilesExclusive lk;
+ privateViews.remove(_view_private);
+ _view_write = _view_private = 0;
+ MemoryMappedFile::close();
+ }
+
+}
diff --git a/src/mongo/db/mongommf.h b/src/mongo/db/mongommf.h
new file mode 100644
index 00000000000..62a6cdfd3fd
--- /dev/null
+++ b/src/mongo/db/mongommf.h
@@ -0,0 +1,145 @@
+/** @file mongommf.h
+*
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/mmap.h"
+#include "../util/paths.h"
+
+namespace mongo {
+
+ /** MongoMMF adds some layers atop memory mapped files - specifically our handling of private views & such.
+ if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class,
+ not this.
+ */
+ class MongoMMF : private MemoryMappedFile {
+ protected:
+ virtual void* viewForFlushing() { return _view_write; }
+
+ public:
+ MongoMMF();
+ virtual ~MongoMMF();
+ virtual void close();
+
+ /** @return true if opened ok. */
+ bool open(string fname, bool sequentialHint /*typically we open with this false*/);
+
+ /** @return file length */
+ unsigned long long length() const { return MemoryMappedFile::length(); }
+
+ string filename() const { return MemoryMappedFile::filename(); }
+
+ void flush(bool sync) { MemoryMappedFile::flush(sync); }
+
+        /* Creates the file with the given length if it does not yet exist;
+           otherwise opens it and updates len to the existing file length.
+           @param sequentialHint if true the file will be sequentially accessed
+           @return true for ok
+        */
+ bool create(string fname, unsigned long long& len, bool sequentialHint);
+
+ /* Get the "standard" view (which is the private one).
+ @return the private view.
+ */
+ void* getView() const { return _view_private; }
+
+ /* Get the "write" view (which is required for writing).
+ @return the write view.
+ */
+ void* view_write() const { return _view_write; }
+
+
+        /* switch to _view_write. normally this is a bad idea, since changes made there
+           will not show up in _view_private if that view has already diverged; hence the
+           leading underscore as a warning. useful when done with some care, though, such
+           as during initialization.
+        */
+ static void* _switchToWritableView(void *private_ptr);
+
+        /** for a filename a/b/c.3
+            relativePath() is "a/b/c"
+            fileSuffixNo() is 3
+            if the suffix is "ns", fileSuffixNo() is -1
+            */
+ const RelativePath& relativePath() const {
+ DEV assert( !_p._p.empty() );
+ return _p;
+ }
+
+ int fileSuffixNo() const { return _fileSuffixNo; }
+
+ /** true if we have written.
+ set in PREPLOGBUFFER, it is NOT set immediately on write intent declaration.
+ reset to false in REMAPPRIVATEVIEW
+ */
+ bool& willNeedRemap() { return _willNeedRemap; }
+
+ void remapThePrivateView();
+
+ virtual bool isMongoMMF() { return true; }
+
+ private:
+
+ void *_view_write;
+ void *_view_private;
+ bool _willNeedRemap;
+ RelativePath _p; // e.g. "somepath/dbname"
+ int _fileSuffixNo; // e.g. 3. -1="ns"
+
+ void setPath(string pathAndFileName);
+ bool finishOpening();
+ };
+
+ /** for durability support we want to be able to map pointers to specific MongoMMF objects.
+ */
+ class PointerToMMF : boost::noncopyable {
+ public:
+ PointerToMMF();
+
+ /** register view.
+ threadsafe
+ */
+ void add(void *view, MongoMMF *f);
+
+ /** de-register view.
+ threadsafe
+ */
+ void remove(void *view);
+
+ /** find associated MMF object for a given pointer.
+ threadsafe
+ @param ofs out returns offset into the view of the pointer, if found.
+ @return the MongoMMF to which this pointer belongs. null if not found.
+ */
+ MongoMMF* find(void *p, /*out*/ size_t& ofs);
+
+ /** for doing many finds in a row with one lock operation */
+ mutex& _mutex() { return _m; }
+ MongoMMF* find_inlock(void *p, /*out*/ size_t& ofs);
+
+ map<void*,MongoMMF*>::iterator finditer_inlock(void *p) { return _views.upper_bound(p); }
+
+ unsigned numberOfViews_inlock() const { return _views.size(); }
+
+ private:
+ mutex _m;
+ map<void*, MongoMMF*> _views;
+ };
+
+ // allows a pointer into any private view of a MongoMMF to be resolved to the MongoMMF object
+ extern PointerToMMF privateViews;
+}
diff --git a/src/mongo/db/mongomutex.h b/src/mongo/db/mongomutex.h
new file mode 100644
index 00000000000..08b091cae9c
--- /dev/null
+++ b/src/mongo/db/mongomutex.h
@@ -0,0 +1,388 @@
+// @file mongomutex.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* Mutex hierarchy (1 = "leaf")
+ name level
+ Logstream::mutex 1
+ ClientCursor::ccmutex 2
+ dblock 3
+
+ End func name with _inlock to indicate "caller must lock before calling".
+*/
+
+#pragma once
+
+#include "../util/concurrency/rwlock.h"
+#include "../util/mmap.h"
+#include "../util/time_support.h"
+#include "d_globals.h"
+
+namespace mongo {
+
+ class Client;
+ Client* curopWaitingForLock( int type );
+ void curopGotLock(Client*);
+
+ /* mongomutex time stats */
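+    /* note only the outermost entered()/leaving() pair is timed; recursive
+       entries and exits just adjust the nesting count. */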
+ class MutexInfo {
+ unsigned long long enter, timeLocked; // microseconds
+ int locked;
+ unsigned long long start; // last as we touch this least often
+ public:
+ MutexInfo() : timeLocked(0) , locked(0) {
+ start = curTimeMicros64();
+ }
+ void entered() {
+ if ( locked == 0 )
+ enter = curTimeMicros64();
+ locked++;
+ assert( locked >= 1 );
+ }
+ void leaving() {
+ locked--;
+ assert( locked >= 0 );
+ if ( locked == 0 )
+ timeLocked += curTimeMicros64() - enter;
+ }
+ int isLocked() const { return locked; }
+ void getTimingInfo(unsigned long long &s, unsigned long long &tl) const {
+ s = start;
+ tl = timeLocked;
+ }
+ unsigned long long getTimeLocked() const { return timeLocked; }
+ };
+
+ /** the 'big lock'. a read/write lock.
+ there is one of these, d.dbMutex.
+
+ generally if you need to declare a mutex use the right primitive class, not this.
+
+ use readlock and writelock classes for scoped locks on this rather than direct
+ manipulation.
+ */
+ class MongoMutex {
+ public:
+ MongoMutex(const char * name);
+
+ /** @return
+ * > 0 write lock
+ * = 0 no lock
+ * < 0 read lock
+ */
+ int getState() const { return _state.get(); }
+
+ bool atLeastReadLocked() const { return _state.get() != 0; }
+ void assertAtLeastReadLocked() const { assert(atLeastReadLocked()); }
+ bool isWriteLocked/*by our thread*/() const { return getState() > 0; }
+ void assertWriteLocked() const {
+ assert( getState() > 0 );
+ DEV assert( !_releasedEarly.get() );
+ }
+
+ // write lock. use the writelock scoped lock class, not this directly.
+ void lock() {
+ if ( _writeLockedAlready() )
+ return;
+
+ _state.set(1);
+
+ curopWaitingForLock( 1 ); // stats
+ _m.lock();
+ MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build
+ _acquiredWriteLock();
+ }
+
+ // try write lock
+ bool lock_try( int millis ) {
+ if ( _writeLockedAlready() ) // adjusts _state
+ return true;
+
+ curopWaitingForLock( 1 );
+ bool got = _m.lock_try( millis );
+
+ if ( got ) {
+ _state.set(1);
+ MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build
+ _acquiredWriteLock();
+ }
+
+ return got;
+ }
+
+        // release the write lock
+ void unlock() {
+ int s = _state.get();
+ if( s > 1 ) {
+ _state.set(s-1); // recursive lock case
+ return;
+ }
+ if( s != 1 ) {
+ if( _releasedEarly.get() ) {
+ _releasedEarly.set(false);
+ return;
+ }
+ massert( 12599, "internal error: attempt to unlock when wasn't in a write lock", false);
+ }
+ _releasingWriteLock();
+ MongoFile::unmarkAllWritable(); // _DEBUG validation
+ _state.set(0);
+ _m.unlock();
+ }
+
+ /* unlock (write lock), and when unlock() is called later,
+ be smart then and don't unlock it again.
+ */
+ void releaseEarly() {
+ assert( getState() == 1 ); // must not be recursive
+ assert( !_releasedEarly.get() );
+ _releasedEarly.set(true);
+ unlock();
+ }
+
+ // read lock. don't call directly, use readlock.
+ void lock_shared() {
+ int s = _state.get();
+ if( s ) {
+ if( s > 0 ) {
+ // already in write lock - just be recursive and stay write locked
+ _state.set(s+1);
+ }
+ else {
+ // already in read lock - recurse
+ _state.set(s-1);
+ }
+ }
+ else {
+ _state.set(-1);
+ Client *c = curopWaitingForLock( -1 );
+ _m.lock_shared();
+ curopGotLock(c);
+ }
+ }
+
+ // try read lock
+ bool lock_shared_try( int millis ) {
+ int s = _state.get();
+ if ( s ) {
+ // we already have a lock, so no need to try
+ lock_shared();
+ return true;
+ }
+
+ /* [dm] should there be
+ Client *c = curopWaitingForLock( 1 );
+ here? i think so. seems to be missing.
+ */
+ bool got = _m.lock_shared_try( millis );
+ if ( got )
+ _state.set(-1);
+ return got;
+ }
+
+ void unlock_shared() {
+ int s = _state.get();
+ if( s > 0 ) {
+ wassert( s > 1 ); /* we must have done a lock write first to have s > 1 */
+ _state.set(s-1);
+ return;
+ }
+ if( s < -1 ) {
+ _state.set(s+1);
+ return;
+ }
+ wassert( s == -1 );
+ _state.set(0);
+ _m.unlock_shared();
+ }
+
+ MutexInfo& info() { return _minfo; }
+
+ private:
+ void lockedExclusively();
+ void unlockingExclusively();
+ void _acquiredWriteLock();
+ void _releasingWriteLock();
+
+ /* @return true if was already write locked. increments recursive lock count. */
+ bool _writeLockedAlready();
+
+ RWLock _m;
+
+ /* > 0 write lock with recurse count
+ < 0 read lock
+ */
+ ThreadLocalValue<int> _state;
+
+ MutexInfo _minfo;
+
+ public:
+ // indicates we need to call dur::REMAPPRIVATEVIEW on the next write lock
+ bool _remapPrivateViewRequested;
+
+ private:
+        /* See the releaseEarly() method.
+           we use a separate TLS value for _releasedEarly - that is ok because
+           on our normal/common code path we never even touch it */
+ ThreadLocalValue<bool> _releasedEarly;
+
+        /* this is for the fsyncAndLock command. otherwise the write lock's greediness will
+           make us block on any attempted write lock while the fsync holds its lock.
+        */
+ //volatile bool _blockWrites;
+ };
+
+ namespace dur {
+ void REMAPPRIVATEVIEW();
+ void releasingWriteLock(); // because it's hard to include dur.h here
+ }
+
+ inline void MongoMutex::_releasingWriteLock() {
+ dur::releasingWriteLock();
+ unlockingExclusively();
+ }
+
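+    // when the write lock is acquired, service any pending REMAPPRIVATEVIEW
+    // request from the durability system -- remapping must be done while
+    // write locked.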
+ inline void MongoMutex::_acquiredWriteLock() {
+ lockedExclusively();
+ if( _remapPrivateViewRequested ) {
+ dur::REMAPPRIVATEVIEW();
+ dassert( !_remapPrivateViewRequested );
+ }
+ }
+
+ string sayClientState();
+
+ /* @return true if was already write locked. increments recursive lock count. */
+ inline bool MongoMutex::_writeLockedAlready() {
+ int s = _state.get();
+ if( s > 0 ) {
+ _state.set(s+1);
+ return true;
+ }
+ massert( 10293 , string("internal error: locks are not upgradeable: ") + sayClientState() , s == 0 );
+ return false;
+ }
+
+ struct writelock {
+ writelock() { d.dbMutex.lock(); }
+ writelock(const string& ns) { d.dbMutex.lock(); }
+ ~writelock() {
+ DESTRUCTOR_GUARD(
+ d.dbMutex.unlock();
+ );
+ }
+ };
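+    // scoped usage, e.g.:
+    //     { writelock lk("db.coll"); /* ... writes ... */ } // lock released at scope exit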
+
+ struct readlock {
+ readlock(const string& ns) {
+ d.dbMutex.lock_shared();
+ }
+ readlock() { d.dbMutex.lock_shared(); }
+ ~readlock() {
+ DESTRUCTOR_GUARD(
+ d.dbMutex.unlock_shared();
+ );
+ }
+ };
+ struct readlocktry {
+ readlocktry( const string&ns , int tryms ) {
+ _got = d.dbMutex.lock_shared_try( tryms );
+ }
+ ~readlocktry() {
+ if ( _got ) {
+ d.dbMutex.unlock_shared();
+ }
+ }
+ bool got() const { return _got; }
+ private:
+ bool _got;
+ };
+
+ struct writelocktry {
+ writelocktry( const string&ns , int tryms ) {
+ _got = d.dbMutex.lock_try( tryms );
+ }
+ ~writelocktry() {
+ if ( _got ) {
+ d.dbMutex.unlock();
+ }
+ }
+ bool got() const { return _got; }
+ private:
+ bool _got;
+ };
+
+ struct readlocktryassert : public readlocktry {
+ readlocktryassert(const string& ns, int tryms) :
+ readlocktry(ns,tryms) {
+ uassert(13142, "timeout getting readlock", got());
+ }
+ };
+
+    /** assure we have at least a read lock - the key point being that
+        if you already have a write lock, that's ok too.
+    */
+ struct atleastreadlock {
+ atleastreadlock( const string& ns = "" ) {
+ _prev = d.dbMutex.getState();
+ if ( _prev == 0 )
+ d.dbMutex.lock_shared();
+ }
+ ~atleastreadlock() {
+ if ( _prev == 0 )
+ d.dbMutex.unlock_shared();
+ }
+ private:
+ int _prev;
+ };
+
+ /* parameterized choice of read or write locking
+ use readlock and writelock instead of this when statically known which you want
+ */
+ class mongolock {
+ bool _writelock;
+ public:
+ mongolock(bool write) : _writelock(write) {
+ if( _writelock ) {
+ d.dbMutex.lock();
+ }
+ else
+ d.dbMutex.lock_shared();
+ }
+ ~mongolock() {
+ DESTRUCTOR_GUARD(
+ if( _writelock ) {
+ d.dbMutex.unlock();
+ }
+ else {
+ d.dbMutex.unlock_shared();
+ }
+ );
+ }
+ /* this unlocks, does NOT upgrade. that works for our current usage */
+ //void releaseAndWriteLock();
+ };
+
+ /* deprecated - use writelock and readlock instead */
+ struct dblock : public writelock {
+ dblock() : writelock("") { }
+ };
+
+ // eliminate this - we should just type "d.dbMutex.assertWriteLocked();" instead
+ inline void assertInWriteLock() { d.dbMutex.assertWriteLocked(); }
+
+}
diff --git a/src/mongo/db/namespace-inl.h b/src/mongo/db/namespace-inl.h
new file mode 100644
index 00000000000..a621a229546
--- /dev/null
+++ b/src/mongo/db/namespace-inl.h
@@ -0,0 +1,132 @@
+// @file namespace-inl.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "namespace.h"
+
+namespace mongo {
+
+ inline Namespace& Namespace::operator=(const char *ns) {
+ // we fill the remaining space with all zeroes here. as the full Namespace struct is in
+ // the datafiles (the .ns files specifically), that is helpful as then they are deterministic
+ // in the bytes they have for a given sequence of operations. that makes testing and debugging
+ // the data files easier.
+ //
+ // if profiling indicates this method is a significant bottleneck, we could have a version we
+ // use for reads which does not fill with zeroes, and keep the zeroing behavior on writes.
+ //
+ unsigned len = strlen(ns);
+ uassert( 10080 , "ns name too long, max size is 128", len < MaxNsLen);
+ memset(buf, 0, MaxNsLen);
+ memcpy(buf, ns, len);
+ return *this;
+ }
+
+ inline string Namespace::extraName(int i) const {
+ char ex[] = "$extra";
+ ex[5] += i;
+ string s = string(buf) + ex;
+ massert( 10348 , "$extra: ns name too long", s.size() < MaxNsLen);
+ return s;
+ }
+
+ inline bool Namespace::isExtra() const {
+ const char *p = strstr(buf, "$extr");
+ return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example
+ }
+
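+    /* simple polynomial (base 131) hash of the name's bytes. the final
+       "| 0x8000000" sets bit 27, keeping the result strictly positive. */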
+ inline int Namespace::hash() const {
+ unsigned x = 0;
+ const char *p = buf;
+ while ( *p ) {
+ x = x * 131 + *p;
+ p++;
+ }
+ return (x & 0x7fffffff) | 0x8000000; // must be > 0
+ }
+
+ /* future : this doesn't need to be an inline. */
+ inline string Namespace::getSisterNS( const char * local ) const {
+ assert( local && local[0] != '.' );
+ string old(buf);
+ if ( old.find( "." ) != string::npos )
+ old = old.substr( 0 , old.find( "." ) );
+ return old + "." + local;
+ }
+
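+    /* index slots 0..9 live in _indexes; slots 10..39 in the first chained
+       Extra block, 40..69 in the second (NIndexesMax caps the total at 64). */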
+ inline IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected ) {
+ if( idxNo < NIndexesBase ) {
+ IndexDetails& id = _indexes[idxNo];
+ return id;
+ }
+ Extra *e = extra();
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 13283 , "Missing Extra" );
+ massert(14045, "missing Extra", e);
+ }
+ int i = idxNo - NIndexesBase;
+ if( i >= NIndexesExtra ) {
+ e = e->next(this);
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 14823 , "missing extra" );
+ massert(14824, "missing Extra", e);
+ }
+ i -= NIndexesExtra;
+ }
+ return e->details[i];
+ }
+
+ inline int NamespaceDetails::idxNo(IndexDetails& idx) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( &i.next() == &idx )
+ return i.pos()-1;
+ }
+ massert( 10349 , "E12000 idxNo fails", false);
+ return -1;
+ }
+
+ inline int NamespaceDetails::findIndexByKeyPattern(const BSONObj& keyPattern) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( i.next().keyPattern() == keyPattern )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ // @return offset in indexes[]
+ inline int NamespaceDetails::findIndexByName(const char *name) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if ( strcmp(i.next().info.obj().getStringField("name"),name) == 0 )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ inline NamespaceDetails::IndexIterator::IndexIterator(NamespaceDetails *_d) {
+ d = _d;
+ i = 0;
+ n = d->nIndexes;
+ }
+
+}
diff --git a/src/mongo/db/namespace.cpp b/src/mongo/db/namespace.cpp
new file mode 100644
index 00000000000..af8b5694248
--- /dev/null
+++ b/src/mongo/db/namespace.cpp
@@ -0,0 +1,800 @@
+// namespace.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "db.h"
+#include "mongommf.h"
+#include "../util/hashtab.h"
+#include "../scripting/engine.h"
+#include "btree.h"
+#include <algorithm>
+#include <list>
+#include "queryutil.h"
+#include "json.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( sizeof(Namespace) == 128 );
+
+ BSONObj idKeyPattern = fromjson("{\"_id\":1}");
+
+ /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
+ so you can look for a deleterecord about the right size.
+ */
+ int bucketSizes[] = {
+ 32, 64, 128, 256, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000,
+ 0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000,
+ 0x400000, 0x800000
+ };
+
+ NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool _capped ) {
+        /* be sure to initialize new fields here -- they do not default to zeroes the way we use this struct */
+ firstExtent = lastExtent = capExtent = loc;
+ stats.datasize = stats.nrecords = 0;
+ lastExtentSize = 0;
+ nIndexes = 0;
+ capped = _capped;
+ max = 0x7fffffff;
+ paddingFactor = 1.0;
+ flags = 0;
+ capFirstNewRecord = DiskLoc();
+ // Signal that we are on first allocation iteration through extents.
+ capFirstNewRecord.setInvalid();
+ // For capped case, signal that we are doing initial extent allocation.
+ if ( capped )
+ cappedLastDelRecLastExtent().setInvalid();
+ assert( sizeof(dataFileVersion) == 2 );
+ dataFileVersion = 0;
+ indexFileVersion = 0;
+ multiKeyIndexBits = 0;
+ reservedA = 0;
+ extraOffset = 0;
+ indexBuildInProgress = 0;
+ reservedB = 0;
+ capped2.cc2_ptr = 0;
+ capped2.fileNumber = 0;
+ memset(reserved, 0, sizeof(reserved));
+ }
+
+ bool NamespaceIndex::exists() const {
+ return !MMF::exists(path());
+ }
+
+ boost::filesystem::path NamespaceIndex::path() const {
+ boost::filesystem::path ret( dir_ );
+ if ( directoryperdb )
+ ret /= database_;
+ ret /= ( database_ + ".ns" );
+ return ret;
+ }
+
+ void NamespaceIndex::maybeMkdir() const {
+ if ( !directoryperdb )
+ return;
+ boost::filesystem::path dir( dir_ );
+ dir /= database_;
+ if ( !boost::filesystem::exists( dir ) )
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::create_directory( dir ), "create dir for db " );
+ }
+
+ unsigned lenForNewNsFiles = 16 * 1024 * 1024;
+
+#if defined(_DEBUG)
+ void NamespaceDetails::dump(const Namespace& k) {
+ if( !cmdLine.dur )
+ cout << "ns offsets which follow will not display correctly with --journal disabled" << endl;
+
+ size_t ofs = 1; // 1 is sentinel that the find call below failed
+ privateViews.find(this, /*out*/ofs);
+
+ cout << "ns" << hex << setw(8) << ofs << ' ';
+ cout << k.toString() << '\n';
+
+ if( k.isExtra() ) {
+ cout << "ns\t extra" << endl;
+ return;
+ }
+
+ cout << "ns " << firstExtent.toString() << ' ' << lastExtent.toString() << " nidx:" << nIndexes << '\n';
+ cout << "ns " << stats.datasize << ' ' << stats.nrecords << ' ' << nIndexes << '\n';
+ cout << "ns " << capped << ' ' << paddingFactor << ' ' << flags << ' ' << dataFileVersion << '\n';
+ cout << "ns " << multiKeyIndexBits << ' ' << indexBuildInProgress << '\n';
+ cout << "ns " << (int) reserved[0] << ' ' << (int) reserved[59];
+ cout << endl;
+ }
+#endif
+
+ void NamespaceDetails::onLoad(const Namespace& k) {
+
+ if( k.isExtra() ) {
+ /* overflow storage for indexes - so don't treat as a NamespaceDetails object. */
+ return;
+ }
+
+ if( indexBuildInProgress || capped2.cc2_ptr ) {
+ assertInWriteLock();
+ if( indexBuildInProgress ) {
+ log() << "indexBuildInProgress was " << indexBuildInProgress << " for " << k << ", indicating an abnormal db shutdown" << endl;
+ getDur().writingInt( indexBuildInProgress ) = 0;
+ }
+ if( capped2.cc2_ptr )
+ *getDur().writing(&capped2.cc2_ptr) = 0;
+ }
+ }
+
+ static void namespaceOnLoadCallback(const Namespace& k, NamespaceDetails& v) {
+ v.onLoad(k);
+ }
+
+ bool checkNsFilesOnLoad = true;
+
+ NOINLINE_DECL void NamespaceIndex::_init() {
+ assert( !ht );
+
+ d.dbMutex.assertWriteLocked();
+
+ /* if someone manually deleted the datafiles for a database,
+ we need to be sure to clear any cached info for the database in
+ local.*.
+ */
+ /*
+ if ( "local" != database_ ) {
+ DBInfo i(database_.c_str());
+ i.dbDropped();
+ }
+ */
+
+ unsigned long long len = 0;
+ boost::filesystem::path nsPath = path();
+ string pathString = nsPath.string();
+ void *p = 0;
+ if( MMF::exists(nsPath) ) {
+ if( f.open(pathString, true) ) {
+ len = f.length();
+ if ( len % (1024*1024) != 0 ) {
+ log() << "bad .ns file: " << pathString << endl;
+ uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 );
+ }
+ p = f.getView();
+ }
+ }
+ else {
+ // use lenForNewNsFiles, we are making a new database
+ massert( 10343, "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 );
+ maybeMkdir();
+ unsigned long long l = lenForNewNsFiles;
+ if( f.create(pathString, l, true) ) {
+ getDur().createdFile(pathString, l); // always a new file
+ len = l;
+ assert( len == lenForNewNsFiles );
+ p = f.getView();
+ }
+ }
+
+ if ( p == 0 ) {
+ /** TODO: this shouldn't terminate? */
+ log() << "error couldn't open file " << pathString << " terminating" << endl;
+ dbexit( EXIT_FS );
+ }
+
+
+ assert( len <= 0x7fffffff );
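+        // the .ns file is used directly as one big memory mapped hash table
+        // mapping Namespace -> NamespaceDetails.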
+ ht = new HashTable<Namespace,NamespaceDetails>(p, (int) len, "namespace index");
+ if( checkNsFilesOnLoad )
+ ht->iterAll(namespaceOnLoadCallback);
+ }
+
+ static void namespaceGetNamespacesCallback( const Namespace& k , NamespaceDetails& v , void * extra ) {
+ list<string> * l = (list<string>*)extra;
+ if ( ! k.hasDollarSign() )
+ l->push_back( (string)k );
+ }
+ void NamespaceIndex::getNamespaces( list<string>& tofill , bool onlyCollections ) const {
+ assert( onlyCollections ); // TODO: need to implement this
+ // need boost::bind or something to make this less ugly
+
+ if ( ht )
+ ht->iterAll( namespaceGetNamespacesCallback , (void*)&tofill );
+ }
+
+ void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) {
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );
+
+ {
+ Record *r = (Record *) getDur().writingPtr(d, sizeof(Record));
+ d = &r->asDeleted();
+ // defensive code: try to make us notice if we reference a deleted record
+ (unsigned&) (r->data) = 0xeeeeeeee;
+ }
+ DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl;
+ if ( capped ) {
+ if ( !cappedLastDelRecLastExtent().isValid() ) {
+ // Initial extent allocation. Insert at end.
+ d->nextDeleted = DiskLoc();
+ if ( cappedListOfAllDeletedRecords().isNull() )
+ getDur().writingDiskLoc( cappedListOfAllDeletedRecords() ) = dloc;
+ else {
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted )
+ ;
+ i.drec()->nextDeleted.writing() = dloc;
+ }
+ }
+ else {
+ d->nextDeleted = cappedFirstDeletedInCurExtent();
+ getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = dloc;
+ // always compact() after this so order doesn't matter
+ }
+ }
+ else {
+ int b = bucket(d->lengthWithHeaders);
+ DiskLoc& list = deletedList[b];
+ DiskLoc oldHead = list;
+ getDur().writingDiskLoc(list) = dloc;
+ d->nextDeleted = oldHead;
+ }
+ }
+
+    /* predetermine the location of the next alloc without actually doing it.
+       if it cannot be predetermined, returns null (so still call alloc() then)
+    */
+ DiskLoc NamespaceDetails::allocWillBeAt(const char *ns, int lenToAlloc) {
+ if ( !capped ) {
+ lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
+ return __stdAlloc(lenToAlloc, true);
+ }
+ return DiskLoc();
+ }
+
+ /** allocate space for a new record from deleted lists.
+ @param lenToAlloc is WITH header
+ @param extentLoc OUT returns the extent location
+ @return null diskloc if no room - allocate a new extent then
+ */
+ DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) {
+ {
+ // align very slightly.
+ // note that if doing more coarse-grained quantization (really just if it isn't always
+ // a constant amount but if it varied by record size) then that quantization should
+ // NOT be done here but rather in __stdAlloc so that we can grab a deletedrecord that
+ // is just big enough if we happen to run into one.
+ lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
+ }
+
+ DiskLoc loc = _alloc(ns, lenToAlloc);
+ if ( loc.isNull() )
+ return loc;
+
+ const DeletedRecord *r = loc.drec();
+ //r = getDur().writing(r);
+
+ /* note we want to grab from the front so our next pointers on disk tend
+ to go in a forward direction which is important for performance. */
+ int regionlen = r->lengthWithHeaders;
+ extentLoc.set(loc.a(), r->extentOfs);
+ assert( r->extentOfs < loc.getOfs() );
+
+ DEBUGGING out() << "TEMP: alloc() returns " << loc.toString() << ' ' << ns << " lentoalloc:" << lenToAlloc << " ext:" << extentLoc.toString() << endl;
+
+ int left = regionlen - lenToAlloc;
+ if ( capped == 0 ) {
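+            // don't split off a remainder that is under 24 bytes or under 1/8
+            // of the requested length -- too small to be worth tracking as a
+            // separate DeletedRecord.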
+ if ( left < 24 || left < (lenToAlloc >> 3) ) {
+ // you get the whole thing.
+ return loc;
+ }
+ }
+
+ /* split off some for further use. */
+ getDur().writingInt(r->lengthWithHeaders) = lenToAlloc;
+ DiskLoc newDelLoc = loc;
+ newDelLoc.inc(lenToAlloc);
+ DeletedRecord *newDel = DataFileMgr::makeDeletedRecord(newDelLoc, left);
+ DeletedRecord *newDelW = getDur().writing(newDel);
+ newDelW->extentOfs = r->extentOfs;
+ newDelW->lengthWithHeaders = left;
+ newDelW->nextDeleted.Null();
+
+ addDeletedRec(newDel, newDelLoc);
+
+ return loc;
+ }
+
+ /* for non-capped collections.
+ @param peekOnly just look up where and don't reserve
+ returned item is out of the deleted list upon return
+ */
+ DiskLoc NamespaceDetails::__stdAlloc(int len, bool peekOnly) {
+ DiskLoc *prev;
+ DiskLoc *bestprev = 0;
+ DiskLoc bestmatch;
+ int bestmatchlen = 0x7fffffff;
+ int b = bucket(len);
+ DiskLoc cur = deletedList[b];
+ prev = &deletedList[b];
+ int extra = 5; // look for a better fit, a little.
+ int chain = 0;
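+        // best-fit scan: walk the deleted chain for this bucket (moving to a
+        // larger bucket if nothing fits), remembering the smallest record that
+        // is big enough; once a fit is found, look at up to 'extra' more
+        // entries for a tighter fit.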
+ while ( 1 ) {
+ {
+ int a = cur.a();
+ if ( a < -1 || a >= 100000 ) {
+ problem() << "~~ Assertion - cur out of range in _alloc() " << cur.toString() <<
+ " a:" << a << " b:" << b << " chain:" << chain << '\n';
+ sayDbContext();
+ if ( cur == *prev )
+ prev->Null();
+ cur.Null();
+ }
+ }
+ if ( cur.isNull() ) {
+ // move to next bucket. if we were doing "extra", just break
+ if ( bestmatchlen < 0x7fffffff )
+ break;
+ b++;
+ if ( b > MaxBucket ) {
+ // out of space. alloc a new extent.
+ return DiskLoc();
+ }
+ cur = deletedList[b];
+ prev = &deletedList[b];
+ continue;
+ }
+ DeletedRecord *r = cur.drec();
+ if ( r->lengthWithHeaders >= len &&
+ r->lengthWithHeaders < bestmatchlen ) {
+ bestmatchlen = r->lengthWithHeaders;
+ bestmatch = cur;
+ bestprev = prev;
+ }
+ if ( bestmatchlen < 0x7fffffff && --extra <= 0 )
+ break;
+ if ( ++chain > 30 && b < MaxBucket ) {
+ // too slow, force move to next bucket to grab a big chunk
+ //b++;
+ chain = 0;
+ cur.Null();
+ }
+ else {
+ /*this defensive check only made sense for the mmap storage engine:
+ if ( r->nextDeleted.getOfs() == 0 ) {
+ problem() << "~~ Assertion - bad nextDeleted " << r->nextDeleted.toString() <<
+ " b:" << b << " chain:" << chain << ", fixing.\n";
+ r->nextDeleted.Null();
+ }*/
+ cur = r->nextDeleted;
+ prev = &r->nextDeleted;
+ }
+ }
+
+ /* unlink ourself from the deleted list */
+ if( !peekOnly ) {
+ const DeletedRecord *bmr = bestmatch.drec();
+ *getDur().writing(bestprev) = bmr->nextDeleted;
+ bmr->nextDeleted.writing().setInvalid(); // defensive.
+ assert(bmr->extentOfs < bestmatch.getOfs());
+ }
+
+ return bestmatch;
+ }
+
+ void NamespaceDetails::dumpDeleted(set<DiskLoc> *extents) {
+ for ( int i = 0; i < Buckets; i++ ) {
+ DiskLoc dl = deletedList[i];
+ while ( !dl.isNull() ) {
+ DeletedRecord *r = dl.drec();
+ DiskLoc extLoc(dl.a(), r->extentOfs);
+ if ( extents == 0 || extents->count(extLoc) <= 0 ) {
+ out() << " bucket " << i << endl;
+ out() << " " << dl.toString() << " ext:" << extLoc.toString();
+ if ( extents && extents->count(extLoc) <= 0 )
+ out() << '?';
+ out() << " len:" << r->lengthWithHeaders << endl;
+ }
+ dl = r->nextDeleted;
+ }
+ }
+ }
+
+ DiskLoc NamespaceDetails::firstRecord( const DiskLoc &startExtent ) const {
+ for (DiskLoc i = startExtent.isNull() ? firstExtent : startExtent;
+ !i.isNull(); i = i.ext()->xnext ) {
+ if ( !i.ext()->firstRecord.isNull() )
+ return i.ext()->firstRecord;
+ }
+ return DiskLoc();
+ }
+
+ DiskLoc NamespaceDetails::lastRecord( const DiskLoc &startExtent ) const {
+ for (DiskLoc i = startExtent.isNull() ? lastExtent : startExtent;
+ !i.isNull(); i = i.ext()->xprev ) {
+ if ( !i.ext()->lastRecord.isNull() )
+ return i.ext()->lastRecord;
+ }
+ return DiskLoc();
+ }
+
+ int n_complaints_cap = 0;
+ void NamespaceDetails::maybeComplain( const char *ns, int len ) const {
+ if ( ++n_complaints_cap < 8 ) {
+ out() << "couldn't make room for new record (len: " << len << ") in capped ns " << ns << '\n';
+ int i = 0;
+ for ( DiskLoc e = firstExtent; !e.isNull(); e = e.ext()->xnext, ++i ) {
+ out() << " Extent " << i;
+ if ( e == capExtent )
+ out() << " (capExtent)";
+ out() << '\n';
+ out() << " magic: " << hex << e.ext()->magic << dec << " extent->ns: " << e.ext()->nsDiagnostic.toString() << '\n';
+ out() << " fr: " << e.ext()->firstRecord.toString() <<
+ " lr: " << e.ext()->lastRecord.toString() << " extent->len: " << e.ext()->length << '\n';
+ }
+            assert( len * 5 > lastExtentSize ); // assume it is an unusually large record; if not, something is broken
+ }
+ }
+
+ /* alloc with capped table handling. */
+ DiskLoc NamespaceDetails::_alloc(const char *ns, int len) {
+ if ( !capped )
+ return __stdAlloc(len, false);
+
+ return cappedAlloc(ns,len);
+ }
+
+ void NamespaceIndex::kill_ns(const char *ns) {
+ d.dbMutex.assertWriteLocked();
+ if ( !ht )
+ return;
+ Namespace n(ns);
+ ht->kill(n);
+
+ for( int i = 0; i<=1; i++ ) {
+ try {
+ Namespace extra(n.extraName(i).c_str());
+ ht->kill(extra);
+ }
+ catch(DBException&) {
+ dlog(3) << "caught exception in kill_ns" << endl;
+ }
+ }
+ }
+
+ void NamespaceIndex::add_ns(const char *ns, DiskLoc& loc, bool capped) {
+ NamespaceDetails details( loc, capped );
+ add_ns( ns, details );
+ }
+ void NamespaceIndex::add_ns( const char *ns, const NamespaceDetails &details ) {
+ d.dbMutex.assertWriteLocked();
+ init();
+ Namespace n(ns);
+ uassert( 10081 , "too many namespaces/collections", ht->put(n, details));
+ }
+
+ /* extra space for indexes when more than 10 */
+ NamespaceDetails::Extra* NamespaceIndex::newExtra(const char *ns, int i, NamespaceDetails *d) {
+ mongo::d.dbMutex.assertWriteLocked();
+ assert( i >= 0 && i <= 1 );
+ Namespace n(ns);
+ Namespace extra(n.extraName(i).c_str()); // throws userexception if ns name too long
+
+ massert( 10350 , "allocExtra: base ns missing?", d );
+ massert( 10351 , "allocExtra: extra already exists", ht->get(extra) == 0 );
+
+ NamespaceDetails::Extra temp;
+ temp.init();
+ uassert( 10082 , "allocExtra: too many namespaces/collections", ht->put(extra, (NamespaceDetails&) temp));
+ NamespaceDetails::Extra *e = (NamespaceDetails::Extra *) ht->get(extra);
+ return e;
+ }
+ NamespaceDetails::Extra* NamespaceDetails::allocExtra(const char *ns, int nindexessofar) {
+ NamespaceIndex *ni = nsindex(ns);
+ int i = (nindexessofar - NIndexesBase) / NIndexesExtra;
+ Extra *e = ni->newExtra(ns, i, this);
+ long ofs = e->ofsFrom(this);
+ if( i == 0 ) {
+ assert( extraOffset == 0 );
+ *getDur().writing(&extraOffset) = ofs;
+ assert( extra() == e );
+ }
+ else {
+ Extra *hd = extra();
+ assert( hd->next(this) == 0 );
+ hd->setNext(ofs);
+ }
+ return e;
+ }
+
+ /* you MUST call when adding an index. see pdfile.cpp */
+ IndexDetails& NamespaceDetails::addIndex(const char *thisns, bool resetTransient) {
+ IndexDetails *id;
+ try {
+ id = &idx(nIndexes,true);
+ }
+ catch(DBException&) {
+ allocExtra(thisns, nIndexes);
+ id = &idx(nIndexes,false);
+ }
+
+ (*getDur().writing(&nIndexes))++;
+ if ( resetTransient )
+ NamespaceDetailsTransient::get(thisns).addedIndex();
+ return *id;
+ }
+
+ // must be called when renaming a NS to fix up extra
+ void NamespaceDetails::copyingFrom(const char *thisns, NamespaceDetails *src) {
+ extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below.
+ Extra *se = src->extra();
+ int n = NIndexesBase;
+ if( se ) {
+ Extra *e = allocExtra(thisns, n);
+ while( 1 ) {
+ n += NIndexesExtra;
+ e->copy(this, *se);
+ se = se->next(src);
+ if( se == 0 ) break;
+ Extra *nxt = allocExtra(thisns, n);
+ e->setNext( nxt->ofsFrom(this) );
+ e = nxt;
+ }
+ assert( extraOffset );
+ }
+ }
+
+ /* returns index of the first index in which the field is present. -1 if not present.
+ (aug08 - this method not currently used)
+ */
+ int NamespaceDetails::fieldIsIndexed(const char *fieldName) {
+ massert( 10346 , "not implemented", false);
+ /*
+ for ( int i = 0; i < nIndexes; i++ ) {
+ IndexDetails& idx = indexes[i];
+ BSONObj idxKey = idx.info.obj().getObjectField("key"); // e.g., { ts : -1 }
+ if ( !idxKey.getField(fieldName).eoo() )
+ return i;
+ }*/
+ return -1;
+ }
+
+ long long NamespaceDetails::storageSize( int * numExtents , BSONArrayBuilder * extentInfo ) const {
+ Extent * e = firstExtent.ext();
+ assert( e );
+
+ long long total = 0;
+ int n = 0;
+ while ( e ) {
+ total += e->length;
+ n++;
+
+ if ( extentInfo ) {
+                extentInfo->append( BSON( "len" << e->length << "loc" << e->myLoc.toBSONObj() ) );
+ }
+
+ e = e->getNextExtent();
+ }
+
+ if ( numExtents )
+ *numExtents = n;
+
+ return total;
+ }
+
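+    /* declare write intent (for the journal) on this NamespaceDetails and all of
+       its chained Extra blocks with a single call. */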
+ NamespaceDetails *NamespaceDetails::writingWithExtra() {
+ vector< pair< long long, unsigned > > writeRanges;
+ writeRanges.push_back( make_pair( 0, sizeof( NamespaceDetails ) ) );
+ for( Extra *e = extra(); e; e = e->next( this ) ) {
+ writeRanges.push_back( make_pair( (char*)e - (char*)this, sizeof( Extra ) ) );
+ }
+ return reinterpret_cast< NamespaceDetails* >( getDur().writingRangesAtOffsets( this, writeRanges ) );
+ }
+
+ /* ------------------------------------------------------------------------- */
+
+ SimpleMutex NamespaceDetailsTransient::_qcMutex("qc");
+ SimpleMutex NamespaceDetailsTransient::_isMutex("is");
+ map< string, shared_ptr< NamespaceDetailsTransient > > NamespaceDetailsTransient::_nsdMap;
+ typedef map< string, shared_ptr< NamespaceDetailsTransient > >::iterator ouriter;
+
+ void NamespaceDetailsTransient::reset() {
+ DEV assertInWriteLock();
+ clearQueryCache();
+ _keysComputed = false;
+ _indexSpecs.clear();
+ }
+
+ /*static*/ NOINLINE_DECL NamespaceDetailsTransient& NamespaceDetailsTransient::make_inlock(const char *ns) {
+ shared_ptr< NamespaceDetailsTransient > &t = _nsdMap[ ns ];
+ assert( t.get() == 0 );
+ Database *database = cc().database();
+ assert( database );
+ if( _nsdMap.size() % 20000 == 10000 ) {
+ // so we notice if insanely large #s
+ log() << "opening namespace " << ns << endl;
+ log() << _nsdMap.size() << " namespaces in nsdMap" << endl;
+ }
+ t.reset( new NamespaceDetailsTransient(database, ns) );
+ return *t;
+ }
+
+ // note with repair there could be two databases with the same ns name.
+ // that is NOT handled here yet! TODO
+ // repair may not use nsdt though not sure. anyway, requires work.
+ NamespaceDetailsTransient::NamespaceDetailsTransient(Database *db, const char *ns) :
+ _ns(ns), _keysComputed(false), _qcWriteCount()
+ {
+ dassert(db);
+ }
+
+ NamespaceDetailsTransient::~NamespaceDetailsTransient() {
+ }
+
+ void NamespaceDetailsTransient::clearForPrefix(const char *prefix) {
+ assertInWriteLock();
+ vector< string > found;
+ for( ouriter i = _nsdMap.begin(); i != _nsdMap.end(); ++i )
+ if ( strncmp( i->first.c_str(), prefix, strlen( prefix ) ) == 0 )
+ found.push_back( i->first );
+ for( vector< string >::iterator i = found.begin(); i != found.end(); ++i ) {
+ _nsdMap[ *i ].reset();
+ }
+ }
+
+ void NamespaceDetailsTransient::eraseForPrefix(const char *prefix) {
+ assertInWriteLock();
+ vector< string > found;
+ for( ouriter i = _nsdMap.begin(); i != _nsdMap.end(); ++i )
+ if ( strncmp( i->first.c_str(), prefix, strlen( prefix ) ) == 0 )
+ found.push_back( i->first );
+ for( vector< string >::iterator i = found.begin(); i != found.end(); ++i ) {
+ _nsdMap.erase(*i);
+ }
+ }
+
+ void NamespaceDetailsTransient::computeIndexKeys() {
+ _keysComputed = true;
+ _indexKeys.clear();
+ NamespaceDetails *d = nsdetails(_ns.c_str());
+ if ( ! d )
+ return;
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() )
+ i.next().keyPattern().getFieldNames(_indexKeys);
+ }
+
+
+ /* ------------------------------------------------------------------------- */
+
+ /* add a new namespace to the system catalog (<dbname>.system.namespaces).
+ options: { capped : ..., size : ... }
+ */
+ void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0) {
+ LOG(1) << "New namespace: " << ns << endl;
+ if ( strstr(ns, "system.namespaces") ) {
+ // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
+ // TODO: fix above should not be strstr!
+ return;
+ }
+
+ {
+ BSONObjBuilder b;
+ b.append("name", ns);
+ if ( options )
+ b.append("options", *options);
+ BSONObj j = b.done();
+ char database[256];
+ nsToDatabase(ns, database);
+ string s = database;
+ if( cmdLine.configsvr && (s != "config" && s != "admin") ) {
+ uasserted(14037, "can't create user databases on a --configsvr instance");
+ }
+ s += ".system.namespaces";
+ theDataFileMgr.insert(s.c_str(), j.objdata(), j.objsize(), true);
+ }
+ }
+
+ void renameNamespace( const char *from, const char *to ) {
+ NamespaceIndex *ni = nsindex( from );
+ assert( ni );
+ assert( ni->details( from ) );
+ assert( ! ni->details( to ) );
+
+ // Our namespace and index details will move to a different
+ // memory location. The only references to namespace and
+ // index details across commands are in cursors and nsd
+ // transient (including query cache) so clear these.
+ ClientCursor::invalidate( from );
+ NamespaceDetailsTransient::eraseForPrefix( from );
+
+ NamespaceDetails *details = ni->details( from );
+ ni->add_ns( to, *details );
+ NamespaceDetails *todetails = ni->details( to );
+ try {
+ todetails->copyingFrom(to, details); // fixes extraOffset
+ }
+ catch( DBException& ) {
+ // could end up here if .ns is full - if so try to clean up / roll back a little
+ ni->kill_ns(to);
+ throw;
+ }
+ ni->kill_ns( from );
+ details = todetails;
+
+ BSONObj oldSpec;
+ char database[MaxDatabaseNameLen];
+ nsToDatabase(from, database);
+ string s = database;
+ s += ".system.namespaces";
+ assert( Helpers::findOne( s.c_str(), BSON( "name" << from ), oldSpec ) );
+
+ BSONObjBuilder newSpecB;
+ BSONObjIterator i( oldSpec.getObjectField( "options" ) );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "create" ) != 0 )
+ newSpecB.append( e );
+ else
+ newSpecB << "create" << to;
+ }
+ BSONObj newSpec = newSpecB.done();
+ addNewNamespaceToCatalog( to, newSpec.isEmpty() ? 0 : &newSpec );
+
+ deleteObjects( s.c_str(), BSON( "name" << from ), false, false, true );
+ // oldSpec variable no longer valid memory
+
+ BSONObj oldIndexSpec;
+ s = database;
+ s += ".system.indexes";
+ while( Helpers::findOne( s.c_str(), BSON( "ns" << from ), oldIndexSpec ) ) {
+ BSONObjBuilder newIndexSpecB;
+ BSONObjIterator i( oldIndexSpec );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "ns" ) != 0 )
+ newIndexSpecB.append( e );
+ else
+ newIndexSpecB << "ns" << to;
+ }
+ BSONObj newIndexSpec = newIndexSpecB.done();
+ DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, false );
+ int indexI = details->findIndexByName( oldIndexSpec.getStringField( "name" ) );
+ IndexDetails &indexDetails = details->idx(indexI);
+ string oldIndexNs = indexDetails.indexNamespace();
+ indexDetails.info = newIndexSpecLoc;
+ string newIndexNs = indexDetails.indexNamespace();
+
+ renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() );
+ deleteObjects( s.c_str(), oldIndexSpec.getOwned(), true, false, true );
+ }
+ }
+
+ bool legalClientSystemNS( const string& ns , bool write ) {
+ if( ns == "local.system.replset" ) return true;
+
+ if ( ns.find( ".system.users" ) != string::npos )
+ return true;
+
+ if ( ns.find( ".system.js" ) != string::npos ) {
+ if ( write )
+ Scope::storedFuncMod();
+ return true;
+ }
+
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/namespace.h b/src/mongo/db/namespace.h
new file mode 100644
index 00000000000..9ceb6a6f4e9
--- /dev/null
+++ b/src/mongo/db/namespace.h
@@ -0,0 +1,629 @@
+// namespace.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "namespacestring.h"
+#include "jsobj.h"
+#include "querypattern.h"
+#include "diskloc.h"
+#include "../util/hashtab.h"
+#include "mongommf.h"
+#include "d_concurrency.h"
+
+namespace mongo {
+
+ class Database;
+
+#pragma pack(1)
+    /* This helper class is the key type of the HashTable in NamespaceIndex, e.g. see the line:
+       HashTable<Namespace,NamespaceDetails> *ht;
+    */
+ class Namespace {
+ public:
+ explicit Namespace(const char *ns) { *this = ns; }
+ Namespace& operator=(const char *ns);
+
+        bool hasDollarSign() const { return strchr( buf , '$' ) != 0; }
+ void kill() { buf[0] = 0x7f; }
+ bool operator==(const char *r) const { return strcmp(buf, r) == 0; }
+ bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; }
+ int hash() const; // value returned is always > 0
+
+ size_t size() const { return strlen( buf ); }
+
+ string toString() const { return (string) buf; }
+ operator string() const { return (string) buf; }
+
+ /* NamespaceDetails::Extra was added after fact to allow chaining of data blocks to support more than 10 indexes
+ (more than 10 IndexDetails). It's a bit hacky because of this late addition with backward
+ file support. */
+ string extraName(int i) const;
+ bool isExtra() const; /* ends with $extr... -- when true an extra block not a normal NamespaceDetails block */
+
+ /** ( foo.bar ).getSisterNS( "blah" ) == foo.blah
+ perhaps this should move to the NamespaceString helper?
+ */
+ string getSisterNS( const char * local ) const;
+
+ enum MaxNsLenValue { MaxNsLen = 128 };
+ private:
+ char buf[MaxNsLen];
+ };
+#pragma pack()
+
+} // namespace mongo
+
+#include "index.h"
+
+namespace mongo {
+
+ /** @return true if a client can modify this namespace even though it is under ".system."
+ For example <dbname>.system.users is ok for regular clients to update.
+ @param write used when .system.js
+ */
+ bool legalClientSystemNS( const string& ns , bool write );
+
+ /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
+ so you can look for a deleterecord about the right size.
+ */
+ const int Buckets = 19;
+ const int MaxBucket = 18;
+
+ extern int bucketSizes[];
+
+#pragma pack(1)
+ /* NamespaceDetails : this is the "header" for a collection that has all its details.
+ It's in the .ns file and this is a memory mapped region (thus the pack pragma above).
+ */
+ class NamespaceDetails {
+ public:
+ enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 };
+
+ /*-------- data fields, as present on disk : */
+ DiskLoc firstExtent;
+ DiskLoc lastExtent;
+ /* NOTE: capped collections v1 override the meaning of deletedList.
+ deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
+ the capped namespace.
+ deletedList[1] points to the last record in the prev extent. When the "current extent"
+ changes, this value is updated. !deletedList[1].isValid() when this value is not
+ yet computed.
+ */
+ DiskLoc deletedList[Buckets];
+ // ofs 168 (8 byte aligned)
+ struct Stats {
+            // datasize and nrecords MUST be adjacent -- code assumes this!
+ long long datasize; // this includes padding, but not record headers
+ long long nrecords;
+ } stats;
+ int lastExtentSize;
+ int nIndexes;
+ private:
+ // ofs 192
+ IndexDetails _indexes[NIndexesBase];
+ public:
+ // ofs 352 (16 byte aligned)
+ int capped;
+ int max; // max # of objects for a capped table. TODO: should this be 64 bit?
+ double paddingFactor; // 1.0 = no padding.
+ // ofs 386 (16)
+ int flags;
+ DiskLoc capExtent;
+ DiskLoc capFirstNewRecord;
+ unsigned short dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h
+ unsigned short indexFileVersion;
+ unsigned long long multiKeyIndexBits;
+ private:
+ // ofs 400 (16)
+ unsigned long long reservedA;
+ long long extraOffset; // where the $extra info is located (bytes relative to this)
+ public:
+ int indexBuildInProgress; // 1 if in prog
+ unsigned reservedB;
+ // ofs 424 (8)
+ struct Capped2 {
+ unsigned long long cc2_ptr; // see capped.cpp
+ unsigned fileNumber;
+ } capped2;
+ char reserved[60];
+ /*-------- end data 496 bytes */
+
+ explicit NamespaceDetails( const DiskLoc &loc, bool _capped );
+
+ class Extra {
+ long long _next;
+ public:
+ IndexDetails details[NIndexesExtra];
+ private:
+ unsigned reserved2;
+ unsigned reserved3;
+ Extra(const Extra&) { assert(false); }
+ Extra& operator=(const Extra& r) { assert(false); return *this; }
+ public:
+ Extra() { }
+ long ofsFrom(NamespaceDetails *d) {
+ return ((char *) this) - ((char *) d);
+ }
+ void init() { memset(this, 0, sizeof(Extra)); }
+ Extra* next(NamespaceDetails *d) {
+ if( _next == 0 ) return 0;
+ return (Extra*) (((char *) d) + _next);
+ }
+ void setNext(long ofs) { *getDur().writing(&_next) = ofs; }
+ void copy(NamespaceDetails *d, const Extra& e) {
+ memcpy(this, &e, sizeof(Extra));
+ _next = 0;
+ }
+ };
+ Extra* extra() {
+ if( extraOffset == 0 ) return 0;
+ return (Extra *) (((char *) this) + extraOffset);
+ }
+ /* add extra space for indexes when more than 10 */
+ Extra* allocExtra(const char *ns, int nindexessofar);
+ void copyingFrom(const char *thisns, NamespaceDetails *src); // must be called when renaming a NS to fix up extra
+
+ /* called when loaded from disk */
+ void onLoad(const Namespace& k);
+
+ /* dump info on this namespace. for debugging. */
+ void dump(const Namespace& k);
+
+ /* dump info on all extents for this namespace. for debugging. */
+ void dumpExtents();
+
+ private:
+ Extent *theCapExtent() const { return capExtent.ext(); }
+ void advanceCapExtent( const char *ns );
+ DiskLoc __capAlloc(int len);
+ DiskLoc cappedAlloc(const char *ns, int len);
+ DiskLoc &cappedFirstDeletedInCurExtent();
+ bool nextIsInCapExtent( const DiskLoc &dl ) const;
+
+ public:
+ DiskLoc& cappedListOfAllDeletedRecords() { return deletedList[0]; }
+ DiskLoc& cappedLastDelRecLastExtent() { return deletedList[1]; }
+ void cappedDumpDelInfo();
+ bool capLooped() const { return capped && capFirstNewRecord.isValid(); }
+ bool inCapExtent( const DiskLoc &dl ) const;
+ void cappedCheckMigrate();
+ /**
+ * Truncate documents newer than the document at 'end' from the capped
+ * collection. The collection cannot be completely emptied using this
+ * function. An assertion will be thrown if that is attempted.
+ * @param inclusive - Truncate 'end' as well iff true
+ */
+ void cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive);
+ /** Remove all documents from the capped collection */
+ void emptyCappedCollection(const char *ns);
+
+ /* when a background index build is in progress, we don't count the index in nIndexes until
+ complete, yet need to still use it in _indexRecord() - thus we use this function for that.
+ */
+ int nIndexesBeingBuilt() const { return nIndexes + indexBuildInProgress; }
+
+ /* NOTE: be careful with flags. are we manipulating them in read locks? if so,
+ this isn't thread safe. TODO
+ */
+ enum NamespaceFlags {
+ Flag_HaveIdIndex = 1 << 0 // set when we have _id index (ONLY if ensureIdIndex was called -- 0 if that has never been called)
+ };
+
+ IndexDetails& idx(int idxNo, bool missingExpected = false );
+
+ /** get the IndexDetails for the index currently being built in the background. (there is at most one) */
+ IndexDetails& inProgIdx() {
+ DEV assert(indexBuildInProgress);
+ return idx(nIndexes);
+ }
+
+ class IndexIterator {
+ public:
+ int pos() { return i; } // note this is the next one to come
+ bool more() { return i < n; }
+ IndexDetails& next() { return d->idx(i++); }
+ private:
+ friend class NamespaceDetails;
+ int i, n;
+ NamespaceDetails *d;
+ IndexIterator(NamespaceDetails *_d);
+ };
+
+ IndexIterator ii() { return IndexIterator(this); }
+
+ /* hackish - find our index # in the indexes array */
+ int idxNo(IndexDetails& idx);
+
+ /* multikey indexes are indexes where there are more than one key in the index
+ for a single document. see multikey in wiki.
+ for these, we have to do some dedup work on queries.
+ */
+ bool isMultikey(int i) const { return (multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0; }
+ void setIndexIsMultikey(int i) {
+ dassert( i < NIndexesMax );
+ unsigned long long x = ((unsigned long long) 1) << i;
+ if( multiKeyIndexBits & x ) return;
+ *getDur().writing(&multiKeyIndexBits) |= x;
+ }
+ void clearIndexIsMultikey(int i) {
+ dassert( i < NIndexesMax );
+ unsigned long long x = ((unsigned long long) 1) << i;
+ if( (multiKeyIndexBits & x) == 0 ) return;
+ *getDur().writing(&multiKeyIndexBits) &= ~x;
+ }
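+        /* illustrative note (not part of the interface): the bit math above means
+           index #3 maps to mask 1ULL << 3 == 0x8, so after setIndexIsMultikey(3)
+           the journaled multiKeyIndexBits has that bit set and isMultikey(3) tests
+           exactly that bit. */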
+
+ /* add a new index. does not add to system.indexes etc. - just to NamespaceDetails.
+ caller must populate returned object.
+ */
+ IndexDetails& addIndex(const char *thisns, bool resetTransient=true);
+
+ void aboutToDeleteAnIndex() {
+ *getDur().writing(&flags) = flags & ~Flag_HaveIdIndex;
+ }
+
+ /* returns index of the first index in which the field is present. -1 if not present. */
+ int fieldIsIndexed(const char *fieldName);
+
+        /* called to indicate that an update fit in place.
+           this is also called on an insert -- the idea is that if you had some mix and then went to
+           pure inserts it would adapt and PF would trend to 1.0. note update calls insert on a move,
+           so there is a double count there that must be adjusted for below.
+
+           todo: greater sophistication could be helpful and added later. for example the absolute
+           size of documents might be considered -- in some cases smaller ones may be more likely
+           to grow than larger ones in the same collection (though not always).
+        */
+ void paddingFits() {
+ MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
+ double x = paddingFactor - 0.001;
+ if ( x >= 1.0 ) {
+ *getDur().writing(&paddingFactor) = x;
+ //getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
+ }
+ }
+ }
+ void paddingTooSmall() {
+ MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
+            /* the more indexes we have, the higher the cost of a move. so we take that into
+               account herein. note on a move that insert() calls paddingFits(), thus
+               here for example with no inserts and nIndexes = 1 we have
+               .001*4-.001 or a 3:1 ratio to non-moves -> 75% non-moves. insert-heavy
+               workloads can push this down considerably. further tweaking will be a good
+               idea, but this should be an adequate starting point.
+            */
+ double N = min(nIndexes,7) + 3;
+ double x = paddingFactor + (0.001 * N);
+ if ( x <= 2.0 ) {
+ *getDur().writing(&paddingFactor) = x;
+ //getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
+ }
+ }
+ }
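+        /* worked example for the two methods above (illustrative, assuming
+           nIndexes == 1 and ignoring the 1-in-4 sampling, which scales both sides
+           equally): a document move calls paddingTooSmall() and its insert() then
+           calls paddingFits(), a net of
+             +0.001 * (min(1,7) + 3) - 0.001 = +0.003
+           per move vs -0.001 per in-place update -- so about three in-place updates
+           balance one move, the ~75% non-move figure mentioned above. */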
+
+ // @return offset in indexes[]
+ int findIndexByName(const char *name);
+
+ // @return offset in indexes[]
+ int findIndexByKeyPattern(const BSONObj& keyPattern);
+
+ void findIndexByType( const string& name , vector<int>& matches ) {
+ IndexIterator i = ii();
+ while ( i.more() ) {
+ if ( i.next().getSpec().getTypeName() == name )
+ matches.push_back( i.pos() - 1 );
+ }
+ }
+
+        /* @return -1 = not found
+           generally _id is the first index, so this is not that expensive an operation (assuming it is present).
+        */
+ int findIdIndex() {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( i.next().isIdIndex() )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ bool haveIdIndex() {
+ return (flags & NamespaceDetails::Flag_HaveIdIndex) || findIdIndex() >= 0;
+ }
+
+ /* return which "deleted bucket" for this size object */
+ static int bucket(int n) {
+ for ( int i = 0; i < Buckets; i++ )
+ if ( bucketSizes[i] > n )
+ return i;
+ return Buckets-1;
+ }
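+        /* e.g. (illustrative, assuming bucketSizes ascends 32, 64, 128, ...):
+           bucket(100) returns the index of the 128 bucket, the first whose size
+           exceeds 100; anything >= the largest bucket size falls into Buckets-1. */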
+
+        /* predetermine the location of the next alloc without actually doing it.
+           if it cannot be predetermined, returns null (so still call alloc() then).
+        */
+ DiskLoc allocWillBeAt(const char *ns, int lenToAlloc);
+
+ /* allocate a new record. lenToAlloc includes headers. */
+ DiskLoc alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc);
+
+ /* add a given record to the deleted chains for this NS */
+ void addDeletedRec(DeletedRecord *d, DiskLoc dloc);
+ void dumpDeleted(set<DiskLoc> *extents = 0);
+ // Start from firstExtent by default.
+ DiskLoc firstRecord( const DiskLoc &startExtent = DiskLoc() ) const;
+ // Start from lastExtent by default.
+ DiskLoc lastRecord( const DiskLoc &startExtent = DiskLoc() ) const;
+ long long storageSize( int * numExtents = 0 , BSONArrayBuilder * extentInfo = 0 ) const;
+
+ int averageObjectSize() {
+ if ( stats.nrecords == 0 )
+ return 5;
+ return (int) (stats.datasize / stats.nrecords);
+ }
+
+ NamespaceDetails *writingWithoutExtra() {
+ return ( NamespaceDetails* ) getDur().writingPtr( this, sizeof( NamespaceDetails ) );
+ }
+ /** Make all linked Extra objects writeable as well */
+ NamespaceDetails *writingWithExtra();
+
+ private:
+ DiskLoc _alloc(const char *ns, int len);
+ void maybeComplain( const char *ns, int len ) const;
+ DiskLoc __stdAlloc(int len, bool willBeAt);
+ void compact(); // combine adjacent deleted records
+ friend class NamespaceIndex;
+ struct ExtraOld {
+ // note we could use this field for more chaining later, so don't waste it:
+ unsigned long long reserved1;
+ IndexDetails details[NIndexesExtra];
+ unsigned reserved2;
+ unsigned reserved3;
+ };
+ /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */
+ void cappedTruncateLastDelUpdate();
+ BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 );
+ BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::ExtraOld) == 496 );
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 );
+ }; // NamespaceDetails
+#pragma pack()
+
+ /* NamespaceDetailsTransient
+
+ these are things we know / compute about a namespace that are transient -- things
+ we don't actually store in the .ns file. so mainly caching of frequently used
+ information.
+
+ CAUTION: Are you maintaining this properly on a collection drop()? A dropdatabase()? Be careful.
+ The current field "allIndexKeys" may have too many keys in it on such an occurrence;
+ as currently used that does not cause anything terrible to happen.
+
+ todo: cleanup code, need abstractions and separation
+ */
+    // todo: multiple dbs with the same name (repairDatabase) are not handled herein. that may be
+    // the way to go, if not used by repair, but we need some sort of enforcement / asserts.
+ class NamespaceDetailsTransient : boost::noncopyable {
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 );
+
+ //Database *database;
+ const string _ns;
+ void reset();
+ static std::map< string, shared_ptr< NamespaceDetailsTransient > > _nsdMap;
+
+ NamespaceDetailsTransient(Database*,const char *ns);
+ public:
+ ~NamespaceDetailsTransient();
+ void addedIndex() { assertInWriteLock(); reset(); }
+ void deletedIndex() { assertInWriteLock(); reset(); }
+ /* Drop cached information on all namespaces beginning with the specified prefix.
+ Can be useful as index namespaces share the same start as the regular collection.
+ SLOW - sequential scan of all NamespaceDetailsTransient objects */
+ static void clearForPrefix(const char *prefix);
+ static void eraseForPrefix(const char *prefix);
+
+ /**
+ * @return a cursor interface to the query optimizer. The implementation may
+ * utilize a single query plan or interleave results from multiple query
+ * plans before settling on a single query plan. Note that the schema of
+ * currKey() documents, the matcher(), and the isMultiKey() nature of the
+ * cursor may change over the course of iteration.
+ *
+ * @param query - Query used to select indexes and populate matchers.
+ *
+ * @param order - Required ordering spec for documents produced by this cursor,
+ * empty object default indicates no order requirement. If no index exists that
+ * satisfies the required sort order, an empty shared_ptr is returned.
+ *
+ * @param requireIndex - If true, no unindexed (ie collection scan) cursors are
+ * used to generate the returned cursor. If an unindexed cursor is required, an
+ * assertion is raised by the cursor during iteration.
+ *
+ * @param simpleEqualityMatch - Set to true for certain simple queries -
+ * see queryoptimizer.cpp.
+ *
+ * The returned cursor may @throw inside of advance() or recoverFromYield() in
+ * certain error cases, for example if a capped overrun occurred during a yield.
+ * This indicates that the cursor was unable to perform a complete scan.
+ *
+ * This is a work in progress. Partial list of features not yet implemented:
+ * - covered indexes
+ * - in memory sorting
+ */
+ static shared_ptr<Cursor> getCursor( const char *ns, const BSONObj &query,
+ const BSONObj &order = BSONObj(), bool requireIndex = false,
+ bool *simpleEqualityMatch = 0 );
+
+ /* indexKeys() cache ---------------------------------------------------- */
+ /* assumed to be in write lock for this */
+ private:
+ bool _keysComputed;
+ set<string> _indexKeys;
+ void computeIndexKeys();
+ public:
+ /* get set of index keys for this namespace. handy to quickly check if a given
+ field is indexed (Note it might be a secondary component of a compound index.)
+ */
+ set<string>& indexKeys() {
+ DEV assertInWriteLock();
+ if ( !_keysComputed )
+ computeIndexKeys();
+ return _indexKeys;
+ }
+
+ /* IndexSpec caching */
+ private:
+ map<const IndexDetails*,IndexSpec> _indexSpecs;
+ static SimpleMutex _isMutex;
+ public:
+ const IndexSpec& getIndexSpec( const IndexDetails * details ) {
+ IndexSpec& spec = _indexSpecs[details];
+ if ( ! spec._finishedInit ) {
+ SimpleMutex::scoped_lock lk(_isMutex);
+ if ( ! spec._finishedInit ) {
+ spec.reset( details );
+ assert( spec._finishedInit );
+ }
+ }
+ return spec;
+ }
+
+ /* query cache (for query optimizer) ------------------------------------- */
+ private:
+ int _qcWriteCount;
+ map< QueryPattern, pair< BSONObj, long long > > _qcCache;
+ static NamespaceDetailsTransient& make_inlock(const char *ns);
+ public:
+ static SimpleMutex _qcMutex;
+
+        /* you must be in the qcMutex when calling this.
+           A NamespaceDetailsTransient object will not go out of scope on you if you are
+           d.dbMutex.atLeastReadLocked(), so you don't have to stay locked.
+           Creates a NamespaceDetailsTransient before returning if one does not exist.
+           todo: avoid creating too many on erroneous ns queries.
+        */
+ static NamespaceDetailsTransient& get_inlock(const char *ns);
+
+ static NamespaceDetailsTransient& get(const char *ns) {
+ SimpleMutex::scoped_lock lk(_qcMutex);
+ return get_inlock(ns);
+ }
+
+ void clearQueryCache() { // public for unit tests
+ _qcCache.clear();
+ _qcWriteCount = 0;
+ }
+ /* you must notify the cache if you are doing writes, as query plan optimality will change */
+ void notifyOfWriteOp() {
+ if ( _qcCache.empty() )
+ return;
+ if ( ++_qcWriteCount >= 100 )
+ clearQueryCache();
+ }
+ BSONObj indexForPattern( const QueryPattern &pattern ) {
+ return _qcCache[ pattern ].first;
+ }
+ long long nScannedForPattern( const QueryPattern &pattern ) {
+ return _qcCache[ pattern ].second;
+ }
+ void registerIndexForPattern( const QueryPattern &pattern, const BSONObj &indexKey, long long nScanned ) {
+ _qcCache[ pattern ] = make_pair( indexKey, nScanned );
+ }
+
+ }; /* NamespaceDetailsTransient */
+
+ inline NamespaceDetailsTransient& NamespaceDetailsTransient::get_inlock(const char *ns) {
+ std::map< string, shared_ptr< NamespaceDetailsTransient > >::iterator i = _nsdMap.find(ns);
+ if( i != _nsdMap.end() &&
+ i->second.get() ) { // could be null ptr from clearForPrefix
+ return *i->second;
+ }
+ return make_inlock(ns);
+ }
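+    /* usage sketch (illustrative): a write path would typically do
+           NamespaceDetailsTransient::get( ns ).notifyOfWriteOp();
+       note get() takes _qcMutex itself; call get_inlock() only when already holding it. */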
+
+ /* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog"
+ if you will: at least the core parts. (Additional info in system.* collections.)
+ */
+ class NamespaceIndex {
+ public:
+ NamespaceIndex(const string &dir, const string &database) :
+ ht( 0 ), dir_( dir ), database_( database ) {}
+
+ /* returns true if new db will be created if we init lazily */
+ bool exists() const;
+
+ void init() {
+ if( !ht )
+ _init();
+ }
+
+ void add_ns(const char *ns, DiskLoc& loc, bool capped);
+ void add_ns( const char *ns, const NamespaceDetails &details );
+
+ NamespaceDetails* details(const char *ns) {
+ if ( !ht )
+ return 0;
+ Namespace n(ns);
+ NamespaceDetails *d = ht->get(n);
+ if ( d && d->capped )
+ d->cappedCheckMigrate();
+ return d;
+ }
+
+ void kill_ns(const char *ns);
+
+ bool find(const char *ns, DiskLoc& loc) {
+ NamespaceDetails *l = details(ns);
+ if ( l ) {
+ loc = l->firstExtent;
+ return true;
+ }
+ return false;
+ }
+
+ bool allocated() const { return ht != 0; }
+
+ void getNamespaces( list<string>& tofill , bool onlyCollections = true ) const;
+
+ NamespaceDetails::Extra* newExtra(const char *ns, int n, NamespaceDetails *d);
+
+ boost::filesystem::path path() const;
+
+ unsigned long long fileLength() const { return f.length(); }
+
+ private:
+ void _init();
+ void maybeMkdir() const;
+
+ MongoMMF f;
+ HashTable<Namespace,NamespaceDetails> *ht;
+ string dir_;
+ string database_;
+ };
+
+ extern string dbpath; // --dbpath parm
+ extern bool directoryperdb;
+
+ // Rename a namespace within current 'client' db.
+ // (Arguments should include db name)
+ void renameNamespace( const char *from, const char *to );
+
+
+} // namespace mongo
diff --git a/src/mongo/db/namespacestring.h b/src/mongo/db/namespacestring.h
new file mode 100644
index 00000000000..d982c5fff75
--- /dev/null
+++ b/src/mongo/db/namespacestring.h
@@ -0,0 +1,147 @@
+// @file namespacestring.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <string>
+
+namespace mongo {
+
+ using std::string;
+
+ /* in the mongo source code, "client" means "database". */
+
+ const int MaxDatabaseNameLen = 256; // max str len for the db name, including null char
+
+ /* e.g.
+ NamespaceString ns("acme.orders");
+ cout << ns.coll; // "orders"
+ */
+ class NamespaceString {
+ public:
+ string db;
+ string coll; // note collection names can have periods in them for organizing purposes (e.g. "system.indexes")
+
+ NamespaceString( const char * ns ) { init(ns); }
+ NamespaceString( const string& ns ) { init(ns.c_str()); }
+
+ string ns() const { return db + '.' + coll; }
+
+ bool isSystem() const { return strncmp(coll.c_str(), "system.", 7) == 0; }
+ bool isCommand() const { return coll == "$cmd"; }
+
+ operator string() const { return ns(); }
+
+ bool operator==( const string& nsIn ) const { return nsIn == ns(); }
+ bool operator==( const char* nsIn ) const { return (string)nsIn == ns(); }
+ bool operator==( const NamespaceString& nsIn ) const { return nsIn.db == db && nsIn.coll == coll; }
+
+ bool operator!=( const string& nsIn ) const { return nsIn != ns(); }
+ bool operator!=( const char* nsIn ) const { return (string)nsIn != ns(); }
+ bool operator!=( const NamespaceString& nsIn ) const { return nsIn.db != db || nsIn.coll != coll; }
+
+ string toString() const { return ns(); }
+
+        /**
+         * @return true if ns is 'normal'. a '$' is used in names of collections holding index data, which do not contain BSON objects in their records.
+         * we special-case the local.oplog.$main ns -- naming it as such was a mistake.
+         */
+ static bool normal(const char* ns) {
+ const char *p = strchr(ns, '$');
+ if( p == 0 )
+ return true;
+ return strcmp( ns, "local.oplog.$main" ) == 0;
+ }
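+        /* e.g. (illustrative): normal("test.foo") == true,
+           normal("test.foo.$x_1") == false (index data),
+           normal("local.oplog.$main") == true (the special case). */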
+
+ static bool special(const char *ns) {
+ return !normal(ns) || strstr(ns, ".system.");
+ }
+
+ /**
+ * samples:
+ * good:
+ * foo
+ * bar
+ * foo-bar
+ * bad:
+ * foo bar
+ * foo.bar
+ * foo"bar
+ *
+ * @param db - a possible database name
+ * @return if db is an allowed database name
+ */
+ static bool validDBName( const string& db ) {
+ if ( db.size() == 0 || db.size() > 64 )
+ return false;
+ size_t good = strcspn( db.c_str() , "/\\. \"" );
+ return good == db.size();
+ }
+
+ /**
+ * samples:
+ * good:
+ * foo.bar
+ * bad:
+ * foo.
+ *
+ * @param dbcoll - a possible collection name of the form db.coll
+ * @return if db.coll is an allowed collection name
+ */
+        static bool validCollectionName(const char* dbcoll){
+            const char *c = strchr( dbcoll, '.' );
+            // guard the missing-'.' case: adding 1 to a null strchr() result is undefined
+            return c && c[1] && normal(dbcoll);
+        }
+
+ private:
+ void init(const char *ns) {
+ const char *p = strchr(ns, '.');
+ if( p == 0 ) return;
+ db = string(ns, p - ns);
+ coll = p + 1;
+ }
+ };
+
+ // "database.a.b.c" -> "database"
+ inline void nsToDatabase(const char *ns, char *database) {
+ const char *p = ns;
+ char *q = database;
+        while ( *p != '.' ) {
+            if ( *p == 0 )
+                break;
+            // check before writing so the copy can never actually overrun the buffer
+            if ( q - database >= MaxDatabaseNameLen - 1 ) {
+                log() << "nsToDatabase: ns too long. terminating, buf overrun condition" << endl;
+                dbexit( EXIT_POSSIBLE_CORRUPTION );
+            }
+            *q++ = *p++;
+        }
+        *q = 0;
+ }
+ inline string nsToDatabase(const char *ns) {
+ char buf[MaxDatabaseNameLen];
+ nsToDatabase(ns, buf);
+ return buf;
+ }
+ inline string nsToDatabase(const string& ns) {
+ size_t i = ns.find( '.' );
+ if ( i == string::npos )
+ return ns;
+ return ns.substr( 0 , i );
+ }
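+    /* e.g. (illustrative): nsToDatabase("test.foo.bar") == "test" -- everything up
+       to the first '.' -- while nsToDatabase("test") == "test" (no '.' present). */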
+
+}
diff --git a/src/mongo/db/nonce.cpp b/src/mongo/db/nonce.cpp
new file mode 100644
index 00000000000..379e88f116d
--- /dev/null
+++ b/src/mongo/db/nonce.cpp
@@ -0,0 +1,95 @@
+// nonce.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "nonce.h"
+#include "../util/time_support.h"
+
+extern int do_md5_test(void);
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( sizeof(nonce64) == 8 );
+
+ static Security security; // needs to be static so _initialized is preset to false (see initsafe below)
+
+ Security::Security() {
+ static int n;
+ massert( 10352 , "Security is a singleton class", ++n == 1);
+ init();
+ }
+
+ NOINLINE_DECL void Security::init() {
+ if( _initialized ) return;
+ _initialized = true;
+
+#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__)
+ _devrandom = new ifstream("/dev/urandom", ios::binary|ios::in);
+    massert( 10353 , "can't open /dev/urandom", _devrandom->is_open() );
+#elif defined(_WIN32)
+    srand(curTimeMicros()); // perhaps not relevant for rand_s but we might want it elsewhere anyway
+#else
+ srandomdev();
+#endif
+
+#ifndef NDEBUG
+ if ( do_md5_test() )
+ massert( 10354 , "md5 unit test fails", false);
+#endif
+ }
+
+ nonce64 Security::__getNonce() {
+ dassert( _initialized );
+ nonce64 n;
+#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__)
+ _devrandom->read((char*)&n, sizeof(n));
+ massert(10355 , "devrandom failed", !_devrandom->fail());
+#elif defined(_WIN32)
+ unsigned a=0, b=0;
+ assert( rand_s(&a) == 0 );
+ assert( rand_s(&b) == 0 );
+ n = (((unsigned long long)a)<<32) | b;
+#else
+ n = (((unsigned long long)random())<<32) | random();
+#endif
+ return n;
+ }
+
+ SimpleMutex nonceMutex("nonce");
+ nonce64 Security::_getNonce() {
+        // not good that this is a static, as gcc will mutex-protect it, which costs time
+ SimpleMutex::scoped_lock lk(nonceMutex);
+ if( !_initialized )
+ init();
+ return __getNonce();
+ }
+
+ nonce64 Security::getNonceDuringInit() {
+ // the mutex might not be inited yet. init phase should be one thread anyway (hopefully we don't spawn threads therein)
+ if( !security._initialized )
+ security.init();
+ return security.__getNonce();
+ }
+
+ nonce64 Security::getNonce() {
+ return security._getNonce();
+ }
+
+ // name warns us this might be a little slow (see code above)
+ unsigned goodRandomNumberSlow() { return (unsigned) Security::getNonce(); }
+
+} // namespace mongo
diff --git a/src/mongo/db/nonce.h b/src/mongo/db/nonce.h
new file mode 100644
index 00000000000..d6a147ab1c0
--- /dev/null
+++ b/src/mongo/db/nonce.h
@@ -0,0 +1,36 @@
+// @file nonce.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ typedef unsigned long long nonce64;
+
+ struct Security {
+ Security();
+ static nonce64 getNonce();
+ static nonce64 getNonceDuringInit(); // use this version during global var constructors
+ private:
+ nonce64 _getNonce();
+ nonce64 __getNonce();
+ ifstream *_devrandom;
+ bool _initialized;
+ void init(); // can call more than once
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/oplog.cpp b/src/mongo/db/oplog.cpp
new file mode 100644
index 00000000000..342f362a28f
--- /dev/null
+++ b/src/mongo/db/oplog.cpp
@@ -0,0 +1,872 @@
+// @file oplog.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "oplog.h"
+#include "repl_block.h"
+#include "repl.h"
+#include "commands.h"
+#include "repl/rs.h"
+#include "stats/counters.h"
+#include "../util/file.h"
+#include "../util/unittest.h"
+#include "queryoptimizer.h"
+#include "ops/update.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+
+namespace mongo {
+
+ void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt );
+
+ int __findingStartInitialTimeout = 5; // configurable for testing
+
+ // cached copies of these...so don't rename them, drop them, etc.!!!
+ static NamespaceDetails *localOplogMainDetails = 0;
+ static Database *localDB = 0;
+ static NamespaceDetails *rsOplogDetails = 0;
+ void oplogCheckCloseDatabase( Database * db ) {
+ localDB = 0;
+ localOplogMainDetails = 0;
+ rsOplogDetails = 0;
+ resetSlaveCache();
+ }
+
+ static void _logOpUninitialized(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
+ uassert(13288, "replSet error write op to db before replSet initialized", str::startsWith(ns, "local.") || *opstr == 'n');
+ }
+
+    /** write an op to the oplog that is already built.
+        todo : make _logOpRS() call this so we don't repeat ourselves?
+    */
+ void _logOpObjRS(const BSONObj& op) {
+ DEV assertInWriteLock();
+
+ const OpTime ts = op["ts"]._opTime();
+ long long h = op["h"].numberLong();
+
+ {
+ const char *logns = rsoplog;
+ if ( rsOplogDetails == 0 ) {
+ Client::Context ctx( logns , dbpath, false);
+ localDB = ctx.db();
+ assert( localDB );
+ rsOplogDetails = nsdetails(logns);
+ massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
+ }
+ Client::Context ctx( logns , localDB, false );
+ {
+ int len = op.objsize();
+ Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
+ memcpy(getDur().writingPtr(r->data, len), op.objdata(), len);
+ }
+        /* todo: now() has code to handle clock skew. but if the server-to-server skew is large it will get unhappy.
+            this code (or the code in now() maybe) should be improved.
+        */
+ if( theReplSet ) {
+ if( !(theReplSet->lastOpTimeWritten<ts) ) {
+                log() << "replSet error possible failover clock skew issue? " << theReplSet->lastOpTimeWritten.toString() << ' ' << ts.toString() << endl;
+ }
+ theReplSet->lastOpTimeWritten = ts;
+ theReplSet->lastH = h;
+ ctx.getClient()->setLastOp( ts );
+ }
+ }
+ }
+
+    /** given a BSON object, create a new one at dst which is the existing (partial) object
+        with a new object element appended at the end with fieldname "o".
+
+        @param partial already built object with everything except the o member. e.g. something like:
+           { ts:..., ns:..., os2:... }
+        @param o a bson object to be added with fieldname "o"
+        @param dst where to put the newly built combined object. e.g. ends up as something like:
+           { ts:..., ns:..., os2:..., o:... }
+    */
+ void append_O_Obj(char *dst, const BSONObj& partial, const BSONObj& o) {
+ const int size1 = partial.objsize() - 1; // less the EOO char
+ const int oOfs = size1+3; // 3 = byte BSONOBJTYPE + byte 'o' + byte \0
+
+ void *p = getDur().writingPtr(dst, oOfs+o.objsize()+1);
+
+ memcpy(p, partial.objdata(), size1);
+
+ // adjust overall bson object size for the o: field
+ *(static_cast<unsigned*>(p)) += o.objsize() + 1/*fieldtype byte*/ + 2/*"o" fieldname*/;
+
+ char *b = static_cast<char *>(p);
+ b += size1;
+ *b++ = (char) Object;
+ *b++ = 'o'; // { o : ... }
+ *b++ = 0; // null terminate "o" fieldname
+ memcpy(b, o.objdata(), o.objsize());
+ b += o.objsize();
+ *b = EOO;
+ }
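+    /* byte-level sketch of the above (illustrative): for partial of size S,
+       dst[0..S-2] holds partial minus its trailing EOO; then come the three added
+       bytes -- type (Object), 'o', '\0' -- then o.objdata(), then a final EOO.
+       the int32 length prefix at dst[0..3] is bumped by o.objsize() + 3 to match. */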
+
+ // global is safe as we are in write lock. we put the static outside the function to avoid the implicit mutex
+ // the compiler would use if inside the function. the reason this is static is to avoid a malloc/free for this
+ // on every logop call.
+ static BufBuilder logopbufbuilder(8*1024);
+ static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
+ DEV assertInWriteLock();
+
+ if ( strncmp(ns, "local.", 6) == 0 ) {
+ if ( strncmp(ns, "local.slaves", 12) == 0 )
+ resetSlaveCache();
+ return;
+ }
+
+ const OpTime ts = OpTime::now();
+ long long hashNew;
+ if( theReplSet ) {
+ massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary());
+ hashNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId();
+ }
+ else {
+ // must be initiation
+ assert( *ns == 0 );
+ hashNew = 0;
+ }
+
+ /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
+ instead we do a single copy to the destination position in the memory mapped file.
+ */
+
+ logopbufbuilder.reset();
+ BSONObjBuilder b(logopbufbuilder);
+ b.appendTimestamp("ts", ts.asDate());
+ b.append("h", hashNew);
+ b.append("op", opstr);
+ b.append("ns", ns);
+ if ( bb )
+ b.appendBool("b", *bb);
+ if ( o2 )
+ b.append("o2", *o2);
+ BSONObj partial = b.done();
+ int posz = partial.objsize();
+ int len = posz + obj.objsize() + 1 + 2 /*o:*/;
+
+ Record *r;
+ DEV assert( logNS == 0 );
+ {
+ const char *logns = rsoplog;
+ if ( rsOplogDetails == 0 ) {
+ Client::Context ctx( logns , dbpath, false);
+ localDB = ctx.db();
+ assert( localDB );
+ rsOplogDetails = nsdetails(logns);
+ massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
+ }
+ Client::Context ctx( logns , localDB, false );
+ r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
+        /* todo: now() has code to handle clock skew. but if the server-to-server skew is large it will get unhappy.
+            this code (or the code in now() maybe) should be improved.
+        */
+ if( theReplSet ) {
+ if( !(theReplSet->lastOpTimeWritten<ts) ) {
+ log() << "replSet ERROR possible failover clock skew issue? " << theReplSet->lastOpTimeWritten << ' ' << ts << rsLog;
+ log() << "replSet " << theReplSet->isPrimary() << rsLog;
+ }
+ theReplSet->lastOpTimeWritten = ts;
+ theReplSet->lastH = hashNew;
+ ctx.getClient()->setLastOp( ts );
+ }
+ }
+
+ append_O_Obj(r->data, partial, obj);
+
+ if ( logLevel >= 6 ) {
+ BSONObj temp(r);
+ log( 6 ) << "logOp:" << temp << endl;
+ }
+ }
+
+    /* we write to local.oplog.$main:
+ { ts : ..., op: ..., ns: ..., o: ... }
+ ts: an OpTime timestamp
+ op:
+ "i" insert
+ "u" update
+ "d" delete
+ "c" db cmd
+ "db" declares presence of a database (ns is set to the db name + '.')
+ "n" no op
+ logNS - where to log it. 0/null means "local.oplog.$main".
+ bb:
+ if not null, specifies a boolean to pass along to the other side as b: param.
+ used for "justOne" or "upsert" flags on 'd', 'u'
+ first: true
+ when set, indicates this is the first thing we have logged for this database.
+ thus, the slave does not need to copy down all the data when it sees this.
+
+ note this is used for single collection logging even when --replSet is enabled.
+ */
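+    /* example entry as laid out by the code below (illustrative values):
+         { ts: Timestamp(...), op: "u", ns: "test.foo", b: true,
+           o2: { _id: ... }, o: { $set: { x: 1 } } }
+    */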
+ static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
+ DEV assertInWriteLock();
+ static BufBuilder bufbuilder(8*1024);
+
+ if ( strncmp(ns, "local.", 6) == 0 ) {
+ if ( strncmp(ns, "local.slaves", 12) == 0 ) {
+ resetSlaveCache();
+ }
+ return;
+ }
+
+ const OpTime ts = OpTime::now();
+ Client::Context context("",0,false);
+
+ /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
+ instead we do a single copy to the destination position in the memory mapped file.
+ */
+
+ bufbuilder.reset();
+ BSONObjBuilder b(bufbuilder);
+ b.appendTimestamp("ts", ts.asDate());
+ b.append("op", opstr);
+ b.append("ns", ns);
+ if ( bb )
+ b.appendBool("b", *bb);
+ if ( o2 )
+ b.append("o2", *o2);
+ BSONObj partial = b.done(); // partial is everything except the o:... part.
+
+ int po_sz = partial.objsize();
+ int len = po_sz + obj.objsize() + 1 + 2 /*o:*/;
+
+ Record *r;
+ if( logNS == 0 ) {
+ logNS = "local.oplog.$main";
+ if ( localOplogMainDetails == 0 ) {
+ Client::Context ctx( logNS , dbpath, false);
+ localDB = ctx.db();
+ assert( localDB );
+ localOplogMainDetails = nsdetails(logNS);
+ assert( localOplogMainDetails );
+ }
+ Client::Context ctx( logNS , localDB, false );
+ r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len);
+ }
+ else {
+ Client::Context ctx( logNS, dbpath, false );
+ assert( nsdetails( logNS ) );
+ // first we allocate the space, then we fill it below.
+ r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
+ }
+
+ append_O_Obj(r->data, partial, obj);
+
+ context.getClient()->setLastOp( ts );
+
+ if ( logLevel >= 6 ) {
+ BSONObj temp(r);
+ log( 6 ) << "logging op:" << temp << endl;
+ }
+
+ }
+
+ static void (*_logOp)(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) = _logOpOld;
+ void newReplUp() {
+ replSettings.master = true;
+ _logOp = _logOpRS;
+ }
+ void newRepl() {
+ replSettings.master = true;
+ _logOp = _logOpUninitialized;
+ }
+ void oldRepl() { _logOp = _logOpOld; }
+
+ void logKeepalive() {
+ _logOp("n", "", 0, BSONObj(), 0, 0);
+ }
+ void logOpComment(const BSONObj& obj) {
+ _logOp("n", "", 0, obj, 0, 0);
+ }
+ void logOpInitiate(const BSONObj& obj) {
+ _logOpRS("n", "", 0, obj, 0, 0);
+ }
+
+    /** @param opstr:
+ c userCreateNS
+ i insert
+ n no-op / keepalive
+ d delete / remove
+ u update
+ */
+ void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) {
+ if ( replSettings.master ) {
+ _logOp(opstr, ns, 0, obj, patt, b);
+ }
+
+ logOpForSharding( opstr , ns , obj , patt );
+ }
+
+ void createOplog() {
+ dblock lk;
+
+ const char * ns = "local.oplog.$main";
+
+ bool rs = !cmdLine._replSet.empty();
+ if( rs )
+ ns = rsoplog;
+
+ Client::Context ctx(ns);
+
+ NamespaceDetails * nsd = nsdetails( ns );
+
+ if ( nsd ) {
+
+ if ( cmdLine.oplogSize != 0 ) {
+ int o = (int)(nsd->storageSize() / ( 1024 * 1024 ) );
+ int n = (int)(cmdLine.oplogSize / ( 1024 * 1024 ) );
+ if ( n != o ) {
+ stringstream ss;
+ ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog";
+ log() << ss.str() << endl;
+ throw UserException( 13257 , ss.str() );
+ }
+ }
+
+ if( rs ) return;
+
+ DBDirectClient c;
+ BSONObj lastOp = c.findOne( ns, Query().sort(reverseNaturalObj) );
+ if ( !lastOp.isEmpty() ) {
+ OpTime::setLast( lastOp[ "ts" ].date() );
+ }
+ return;
+ }
+
+ /* create an oplog collection, if it doesn't yet exist. */
+ BSONObjBuilder b;
+ double sz;
+ if ( cmdLine.oplogSize != 0 )
+ sz = (double)cmdLine.oplogSize;
+ else {
+ /* not specified. pick a default size */
+ sz = 50.0 * 1000 * 1000;
+ if ( sizeof(int *) >= 8 ) {
+#if defined(__APPLE__)
+ // typically these are desktops (dev machines), so keep it smallish
+ sz = (256-64) * 1000 * 1000;
+#else
+ sz = 990.0 * 1000 * 1000;
+ boost::intmax_t free = File::freeSpace(dbpath); //-1 if call not supported.
+ double fivePct = free * 0.05;
+ if ( fivePct > sz )
+ sz = fivePct;
+#endif
+ }
+ }
+
+ log() << "******" << endl;
+ log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..." << endl;
+
+ b.append("size", sz);
+ b.appendBool("capped", 1);
+ b.appendBool("autoIndexId", false);
+
+ string err;
+ BSONObj o = b.done();
+ userCreateNS(ns, o, err, false);
+ if( !rs )
+ logOp( "n", "", BSONObj() );
+
+ /* sync here so we don't get any surprising lag later when we try to sync */
+ MemoryMappedFile::flushAll(true);
+ log() << "******" << endl;
+ }
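+    /* sizing sketch for the defaults above (illustrative): on a 64-bit non-OS X
+       host with 100GB free under dbpath, five percent is 5GB > 990MB, so the
+       default oplog would be ~5GB; with only 10GB free, the 990MB floor wins. */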
+
+ // -------------------------------------
+
+ FindingStartCursor::FindingStartCursor( const QueryPlan & qp ) :
+ _qp( qp ),
+ _findingStart( true ),
+ _findingStartMode()
+ { init(); }
+
+ void FindingStartCursor::next() {
+ if ( !_findingStartCursor || !_findingStartCursor->ok() ) {
+ _findingStart = false;
+ _c = _qp.newCursor(); // on error, start from beginning
+ destroyClientCursor();
+ return;
+ }
+ switch( _findingStartMode ) {
+ // Initial mode: scan backwards from end of collection
+ case Initial: {
+ if ( !_matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStart = false; // found first record out of query range, so scan normally
+ _c = _qp.newCursor( _findingStartCursor->currLoc() );
+ destroyClientCursor();
+ return;
+ }
+ _findingStartCursor->advance();
+ RARELY {
+ if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) {
+ // If we've scanned enough, switch to find extent mode.
+ createClientCursor( extentFirstLoc( _findingStartCursor->currLoc() ) );
+ _findingStartMode = FindExtent;
+ return;
+ }
+ }
+ return;
+ }
+ // FindExtent mode: moving backwards through extents, check first
+ // document of each extent.
+ case FindExtent: {
+ if ( !_matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStartMode = InExtent;
+ return;
+ }
+ DiskLoc prev = prevExtentFirstLoc( _findingStartCursor->currLoc() );
+ if ( prev.isNull() ) { // hit beginning, so start scanning from here
+ createClientCursor();
+ _findingStartMode = InExtent;
+ return;
+ }
+            // There might be a more efficient implementation than creating a new cursor & client
+            // cursor each time; not worrying about that for now.
+ createClientCursor( prev );
+ return;
+ }
+ // InExtent mode: once an extent is chosen, find starting doc in the extent.
+ case InExtent: {
+ if ( _matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStart = false; // found first record in query range, so scan normally
+ _c = _qp.newCursor( _findingStartCursor->currLoc() );
+ destroyClientCursor();
+ return;
+ }
+ _findingStartCursor->advance();
+ return;
+ }
+ default: {
+ massert( 14038, "invalid _findingStartMode", false );
+ }
+ }
+ }
+
+ DiskLoc FindingStartCursor::extentFirstLoc( const DiskLoc &rec ) {
+ Extent *e = rec.rec()->myExtent( rec );
+ if ( !_qp.nsd()->capLooped() || ( e->myLoc != _qp.nsd()->capExtent ) )
+ return e->firstRecord;
+ // Likely we are on the fresh side of capExtent, so return first fresh record.
+ // If we are on the stale side of capExtent, then the collection is small and it
+ // doesn't matter if we start the extent scan with capFirstNewRecord.
+ return _qp.nsd()->capFirstNewRecord;
+ }
+
+ void wassertExtentNonempty( const Extent *e ) {
+ // TODO ensure this requirement is clearly enforced, or fix.
+ wassert( !e->firstRecord.isNull() );
+ }
+
+ DiskLoc FindingStartCursor::prevExtentFirstLoc( const DiskLoc &rec ) {
+ Extent *e = rec.rec()->myExtent( rec );
+ if ( _qp.nsd()->capLooped() ) {
+ if ( e->xprev.isNull() ) {
+ e = _qp.nsd()->lastExtent.ext();
+ }
+ else {
+ e = e->xprev.ext();
+ }
+ if ( e->myLoc != _qp.nsd()->capExtent ) {
+ wassertExtentNonempty( e );
+ return e->firstRecord;
+ }
+ }
+ else {
+ if ( !e->xprev.isNull() ) {
+ e = e->xprev.ext();
+ wassertExtentNonempty( e );
+ return e->firstRecord;
+ }
+ }
+ return DiskLoc(); // reached beginning of collection
+ }
+
+ void FindingStartCursor::createClientCursor( const DiskLoc &startLoc ) {
+ shared_ptr<Cursor> c = _qp.newCursor( startLoc );
+ _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns()) );
+ }
+
+ bool FindingStartCursor::firstDocMatchesOrEmpty() const {
+ shared_ptr<Cursor> c = _qp.newCursor();
+ return !c->ok() || _matcher->matchesCurrent( c.get() );
+ }
+
+ void FindingStartCursor::init() {
+ BSONElement tsElt = _qp.originalQuery()[ "ts" ];
+ massert( 13044, "no ts field in query", !tsElt.eoo() );
+ BSONObjBuilder b;
+ b.append( tsElt );
+ BSONObj tsQuery = b.obj();
+ _matcher.reset(new CoveredIndexMatcher(tsQuery, _qp.indexKey()));
+ if ( firstDocMatchesOrEmpty() ) {
+ _c = _qp.newCursor();
+ _findingStart = false;
+ return;
+ }
+ // Use a ClientCursor here so we can release db mutex while scanning
+ // oplog (can take quite a while with large oplogs).
+ shared_ptr<Cursor> c = _qp.newReverseCursor();
+ _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()) );
+ _findingStartTimer.reset();
+ _findingStartMode = Initial;
+ }
+
+ // -------------------------------------
+
+ struct TestOpTime : public UnitTest {
+ void run() {
+ OpTime t;
+ for ( int i = 0; i < 10; i++ ) {
+ OpTime s = OpTime::now_inlock();
+ assert( s != t );
+ t = s;
+ }
+ OpTime q = t;
+ assert( q == t );
+ assert( !(q != t) );
+ }
+ } testoptime;
+
+ int _dummy_z;
+
+ void pretouchN(vector<BSONObj>& v, unsigned a, unsigned b) {
+ DEV assert( !d.dbMutex.isWriteLocked() );
+
+ Client *c = currentClient.get();
+ if( c == 0 ) {
+ Client::initThread("pretouchN");
+ c = &cc();
+ }
+
+ readlock lk("");
+ for( unsigned i = a; i <= b; i++ ) {
+ const BSONObj& op = v[i];
+ const char *which = "o";
+ const char *opType = op.getStringField("op");
+ if ( *opType == 'i' )
+ ;
+ else if( *opType == 'u' )
+ which = "o2";
+ else
+ continue;
+ /* todo : other operations */
+
+ try {
+ BSONObj o = op.getObjectField(which);
+ BSONElement _id;
+ if( o.getObjectID(_id) ) {
+ const char *ns = op.getStringField("ns");
+ BSONObjBuilder b;
+ b.append(_id);
+ BSONObj result;
+ Client::Context ctx( ns );
+ if( Helpers::findById(cc(), ns, b.done(), result) )
+ _dummy_z += result.objsize(); // touch
+ }
+ }
+ catch( DBException& e ) {
+ log() << "ignoring assertion in pretouchN() " << a << ' ' << b << ' ' << i << ' ' << e.toString() << endl;
+ }
+ }
+ }
+
+ void pretouchOperation(const BSONObj& op) {
+
+ if( d.dbMutex.isWriteLocked() )
+ return; // no point pretouching if write locked. not sure if this will ever fire, but just in case.
+
+ const char *which = "o";
+ const char *opType = op.getStringField("op");
+ if ( *opType == 'i' )
+ ;
+ else if( *opType == 'u' )
+ which = "o2";
+ else
+ return;
+ /* todo : other operations */
+
+ try {
+ BSONObj o = op.getObjectField(which);
+ BSONElement _id;
+ if( o.getObjectID(_id) ) {
+ const char *ns = op.getStringField("ns");
+ BSONObjBuilder b;
+ b.append(_id);
+ BSONObj result;
+ readlock lk(ns);
+ Client::Context ctx( ns );
+ if( Helpers::findById(cc(), ns, b.done(), result) )
+ _dummy_z += result.objsize(); // touch
+ }
+ }
+ catch( DBException& ) {
+ log() << "ignoring assertion in pretouchOperation()" << endl;
+ }
+ }
+
+ BSONObj Sync::getMissingDoc(const BSONObj& o) {
+ OplogReader missingObjReader;
+
+ uassert(15916, str::stream() << "Can no longer connect to initial sync source: " << hn, missingObjReader.connect(hn));
+
+ const char *ns = o.getStringField("ns");
+ // might be more than just _id in the update criteria
+ BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
+ BSONObj missingObj;
+ try {
+ missingObj = missingObjReader.findOne(ns, query);
+ } catch(DBException& e) {
+ log() << "replication assertion fetching missing object: " << e.what() << endl;
+ throw;
+ }
+
+ return missingObj;
+ }
+
+ bool Sync::shouldRetry(const BSONObj& o) {
+ // we don't have the object yet, which is possible on initial sync. get it.
+ log() << "replication info adding missing object" << endl; // rare enough we can log
+
+ BSONObj missingObj = getMissingDoc(o);
+
+ if( missingObj.isEmpty() ) {
+ log() << "replication missing object not found on source. presumably deleted later in oplog" << endl;
+ log() << "replication o2: " << o.getObjectField("o2").toString() << endl;
+ log() << "replication o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl;
+
+ return false;
+ }
+ else {
+ const char *ns = o.getStringField("ns");
+ Client::Context ctx(ns);
+ DiskLoc d = theDataFileMgr.insert(ns, (void*) missingObj.objdata(), missingObj.objsize());
+ uassert(15917, "Got bad disk location when attempting to insert", !d.isNull());
+
+ return true;
+ }
+ }
+
+ /** @param fromRepl false if from ApplyOpsCmd
+        @return true if it was an update that should have happened but the document does not exist. see replset initial sync code.
+ */
+ bool applyOperation_inlock(const BSONObj& op , bool fromRepl ) {
+ assertInWriteLock();
+ LOG(6) << "applying op: " << op << endl;
+ bool failedUpdate = false;
+
+ OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;
+
+ const char *names[] = { "o", "ns", "op", "b" };
+ BSONElement fields[4];
+ op.getFields(4, names, fields);
+
+ BSONObj o;
+ if( fields[0].isABSONObj() )
+ o = fields[0].embeddedObject();
+
+ const char *ns = fields[1].valuestrsafe();
+
+ // operation type -- see logOp() comments for types
+ const char *opType = fields[2].valuestrsafe();
+
+ if ( *opType == 'i' ) {
+ opCounters->gotInsert();
+
+ const char *p = strchr(ns, '.');
+ if ( p && strcmp(p, ".system.indexes") == 0 ) {
+ // updates aren't allowed for indexes -- so we will do a regular insert. if index already
+ // exists, that is ok.
+ theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize());
+ }
+ else {
+ // do upserts for inserts as we might get replayed more than once
+ OpDebug debug;
+ BSONElement _id;
+ if( !o.getObjectID(_id) ) {
+ /* No _id. This will be very slow. */
+ Timer t;
+ updateObjects(ns, o, o, true, false, false, debug );
+ if( t.millis() >= 2 ) {
+ RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
+ }
+ }
+ else {
+                /* erh 10/16/2009 - this is probably not relevant any more since it's auto-created, but not worth removing */
+ RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow
+
+ /* todo : it may be better to do an insert here, and then catch the dup key exception and do update
+ then. very few upserts will not be inserts...
+ */
+ BSONObjBuilder b;
+ b.append(_id);
+ updateObjects(ns, o, b.done(), true, false, false , debug );
+ }
+ }
+ }
+ else if ( *opType == 'u' ) {
+ opCounters->gotUpdate();
+ // dm do we create this for a capped collection?
+ // - if not, updates would be slow
+ // - but if were by id would be slow on primary too so maybe ok
+ // - if on primary was by another key and there are other indexes, this could be very bad w/out an index
+ // - if do create, odd to have on secondary but not primary. also can cause secondary to block for
+ // quite a while on creation.
+ RARELY ensureHaveIdIndex(ns); // otherwise updates will be super slow
+ OpDebug debug;
+ BSONObj updateCriteria = op.getObjectField("o2");
+ bool upsert = fields[3].booleanSafe();
+ UpdateResult ur = updateObjects(ns, o, updateCriteria, upsert, /*multi*/ false, /*logop*/ false , debug );
+ if( ur.num == 0 ) {
+ if( ur.mod ) {
+ if( updateCriteria.nFields() == 1 ) {
+ // was a simple { _id : ... } update criteria
+ failedUpdate = true;
+ // todo: probably should assert in these failedUpdate cases if not in initialSync
+ }
+                    // need to check to see if it isn't present so we can set failedUpdate correctly.
+                    // note this adds some overhead for the extra check in some cases, such as an updateCriteria
+                    // of the form
+                    //   { _id:..., x : { $size:... } }
+                    // thus this is not ideal.
+ else {
+ NamespaceDetails *nsd = nsdetails(ns);
+
+ if (nsd == NULL ||
+ (nsd->findIdIndex() >= 0 && Helpers::findById(nsd, updateCriteria).isNull()) ||
+ // capped collections won't have an _id index
+ (nsd->findIdIndex() < 0 && Helpers::findOne(ns, updateCriteria, false).isNull())) {
+ failedUpdate = true;
+ }
+
+ // Otherwise, it's present; zero objects were updated because of additional specifiers
+ // in the query for idempotence
+ }
+ }
+ else {
+                    // this could happen benignly on an oplog duplicate replay of an upsert
+                    // (because we are idempotent);
+                    // if a regular non-mod update fails, the item is (presumably) missing.
+ if( !upsert ) {
+ failedUpdate = true;
+ }
+ }
+ }
+ }
+ else if ( *opType == 'd' ) {
+ opCounters->gotDelete();
+ if ( opType[1] == 0 )
+ deleteObjects(ns, o, /*justOne*/ fields[3].booleanSafe());
+ else
+ assert( opType[1] == 'b' ); // "db" advertisement
+ }
+ else if ( *opType == 'c' ) {
+ opCounters->gotCommand();
+ BufBuilder bb;
+ BSONObjBuilder ob;
+ _runCommands(ns, o, bb, ob, true, 0);
+ }
+ else if ( *opType == 'n' ) {
+ // no op
+ }
+ else {
+ throw MsgAssertionException( 14825 , ErrorMsg("error in applyOperation : unknown opType ", *opType) );
+ }
+ return failedUpdate;
+ }
+
+ class ApplyOpsCmd : public Command {
+ public:
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ ApplyOpsCmd() : Command( "applyOps" ) {}
+ virtual void help( stringstream &help ) const {
+ help << "internal (sharding)\n{ applyOps : [ ] , preCondition : [ { ns : ... , q : ... , res : ... } ] }";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ if ( cmdObj.firstElement().type() != Array ) {
+ errmsg = "ops has to be an array";
+ return false;
+ }
+
+ BSONObj ops = cmdObj.firstElement().Obj();
+
+ {
+ // check input
+ BSONObjIterator i( ops );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == Object )
+ continue;
+ errmsg = "op not an object: ";
+ errmsg += e.fieldName();
+ return false;
+ }
+ }
+
+ if ( cmdObj["preCondition"].type() == Array ) {
+ BSONObjIterator i( cmdObj["preCondition"].Obj() );
+ while ( i.more() ) {
+ BSONObj f = i.next().Obj();
+
+ BSONObj realres = db.findOne( f["ns"].String() , f["q"].Obj() );
+
+ Matcher m( f["res"].Obj() );
+ if ( ! m.matches( realres ) ) {
+ result.append( "got" , realres );
+ result.append( "whatFailed" , f );
+ errmsg = "pre-condition failed";
+ return false;
+ }
+ }
+ }
+
+ // apply
+ int num = 0;
+ BSONObjIterator i( ops );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ // todo SERVER-4259 ?
+ applyOperation_inlock( e.Obj() , false );
+ num++;
+ }
+
+ result.append( "applied" , num );
+
+ if ( ! fromRepl ) {
+ // We want this applied atomically on slaves
+ // so we re-wrap without the pre-condition for speed
+
+ string tempNS = str::stream() << dbname << ".$cmd";
+
+ logOp( "c" , tempNS.c_str() , cmdObj.firstElement().wrap() );
+ }
+
+ return true;
+ }
+
+ DBDirectClient db;
+
+ } applyOpsCmd;
+
+}
diff --git a/src/mongo/db/oplog.h b/src/mongo/db/oplog.h
new file mode 100644
index 00000000000..6c1644fe3ab
--- /dev/null
+++ b/src/mongo/db/oplog.h
@@ -0,0 +1,149 @@
+// oplog.h - writing to and reading from oplog
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+
+ local.oplog.$main is the default
+*/
+
+#pragma once
+
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "clientcursor.h"
+#include "../client/dbclient.h"
+#include "../util/optime.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ void createOplog();
+
+ void _logOpObjRS(const BSONObj& op);
+
+ /** Write operation to the log (local.oplog.$main)
+
+ @param opstr
+ "i" insert
+ "u" update
+ "d" delete
+ "c" db cmd
+ "n" no-op
+ "db" declares presence of a database (ns is set to the db name + '.')
+
+ See _logOp() in oplog.cpp for more details.
+ */
+ void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt = 0, bool *b = 0);
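+    /* usage sketch (illustrative): after inserting obj into test.foo,
+           logOp( "i", "test.foo", obj );
+       and for a justOne delete, pass the flag through b:
+           bool justOne = true;
+           logOp( "d", "test.foo", criteria, 0, &justOne );
+    */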
+
+ void logKeepalive();
+
+ /** puts obj in the oplog as a comment (a no-op). Just for diags.
+ convention is
+ { msg : "text", ... }
+ */
+ void logOpComment(const BSONObj& obj);
+
+ void oplogCheckCloseDatabase( Database * db );
+
+ extern int __findingStartInitialTimeout; // configurable for testing
+
+ class QueryPlan;
+
+ /** Implements an optimized procedure for finding the first op in the oplog. */
+ class FindingStartCursor {
+ public:
+
+ /**
+ * The cursor will attempt to find the first op in the oplog matching the
+ * 'ts' field of the qp's query.
+ */
+ FindingStartCursor( const QueryPlan & qp );
+
+ /** @return true if the first matching op in the oplog has been found. */
+ bool done() const { return !_findingStart; }
+
+ /** @return cursor pointing to the first matching op, if done(). */
+ shared_ptr<Cursor> cursor() { verify( 14835, done() ); return _c; }
+
+ /** Iterate the cursor, to continue trying to find matching op. */
+ void next();
+
+ /** Yield cursor, if not done(). */
+ bool prepareToYield() {
+ if ( _findingStartCursor ) {
+ return _findingStartCursor->prepareToYield( _yieldData );
+ }
+ return false;
+ }
+
+ /** Recover from cursor yield. */
+ void recoverFromYield() {
+ if ( _findingStartCursor ) {
+ if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _findingStartCursor.reset( 0 );
+ msgassertedNoTrace( 15889, "FindingStartCursor::recoverFromYield() failed to recover" );
+ }
+ }
+ }
+ private:
+ enum FindingStartMode { Initial, FindExtent, InExtent };
+ const QueryPlan &_qp;
+ bool _findingStart;
+ FindingStartMode _findingStartMode;
+ auto_ptr< CoveredIndexMatcher > _matcher;
+ Timer _findingStartTimer;
+ ClientCursor::CleanupPointer _findingStartCursor;
+ shared_ptr<Cursor> _c;
+ ClientCursor::YieldData _yieldData;
+ DiskLoc extentFirstLoc( const DiskLoc &rec );
+
+ DiskLoc prevExtentFirstLoc( const DiskLoc &rec );
+ void createClientCursor( const DiskLoc &startLoc = DiskLoc() );
+ void destroyClientCursor() {
+ _findingStartCursor.reset( 0 );
+ }
+ void init();
+ bool firstDocMatchesOrEmpty() const;
+ };
+
+ class Sync {
+ protected:
+ string hn;
+ public:
+ Sync(const string& hostname) : hn(hostname) {}
+ virtual ~Sync() {}
+ virtual BSONObj getMissingDoc(const BSONObj& o);
+
+ /**
+         * Whether applyOperation_inlock should be called again after an update fails.
+ */
+ virtual bool shouldRetry(const BSONObj& o);
+ };
+
+ void pretouchOperation(const BSONObj& op);
+ void pretouchN(vector<BSONObj>&, unsigned a, unsigned b);
+
+    /**
+     * take an op and apply it locally.
+     * used for applying ops from an oplog.
+     * @param fromRepl really from replication, as opposed to testing/internal/command/etc...
+     * @return whether the op was an update that could not be applied (true on failure)
+     */
+ bool applyOperation_inlock(const BSONObj& op , bool fromRepl = true );
+}
diff --git a/src/mongo/db/oplogreader.h b/src/mongo/db/oplogreader.h
new file mode 100644
index 00000000000..6efd1469c01
--- /dev/null
+++ b/src/mongo/db/oplogreader.h
@@ -0,0 +1,121 @@
+/** @file oplogreader.h */
+
+#pragma once
+
+#include "../client/dbclient.h"
+#include "../client/constants.h"
+#include "dbhelpers.h"
+
+namespace mongo {
+
+    /* started abstracting out the querying of the primary/master's oplog.
+       still fairly awkward, but a start.
+    */
+ class OplogReader {
+ shared_ptr<DBClientConnection> _conn;
+ shared_ptr<DBClientCursor> cursor;
+ public:
+ OplogReader() { }
+ ~OplogReader() { }
+ void resetCursor() { cursor.reset(); }
+ void resetConnection() {
+ cursor.reset();
+ _conn.reset();
+ }
+ DBClientConnection* conn() { return _conn.get(); }
+ BSONObj findOne(const char *ns, const Query& q) {
+ return conn()->findOne(ns, q, 0, QueryOption_SlaveOk);
+ }
+ BSONObj getLastOp(const char *ns) {
+ return findOne(ns, Query().sort(reverseNaturalObj));
+ }
+
+ /* ok to call if already connected */
+ bool connect(string hostname);
+
+ bool connect(const BSONObj& rid, const int from, const string& to);
+
+ void tailCheck() {
+ if( cursor.get() && cursor->isDead() ) {
+ log() << "repl: old cursor isDead, will initiate a new one" << endl;
+ resetCursor();
+ }
+ }
+
+ bool haveCursor() { return cursor.get() != 0; }
+
+        /** this is ok but commented out: when using it, one should consider whether
+            QueryOption_OplogReplay is needed; if not, fine, but if so, this needs to change.
+            *//*
+ void query(const char *ns, const BSONObj& query) {
+ assert( !haveCursor() );
+ cursor.reset( _conn->query(ns, query, 0, 0, 0, QueryOption_SlaveOk).release() );
+ }*/
+
+        /** this can be used; it is commented out because it does not indicate
+            QueryOption_OplogReplay, and that is likely important. it could be uncommented --
+            we just need to add that option.
+        */
+ /*
+ void queryGTE(const char *ns, OpTime t) {
+ BSONObjBuilder q;
+ q.appendDate("$gte", t.asDate());
+ BSONObjBuilder q2;
+ q2.append("ts", q.done());
+ query(ns, q2.done());
+ }
+ */
+
+ void tailingQuery(const char *ns, const BSONObj& query, const BSONObj* fields=0) {
+ assert( !haveCursor() );
+ log(2) << "repl: " << ns << ".find(" << query.toString() << ')' << endl;
+ cursor.reset( _conn->query( ns, query, 0, 0, fields,
+ QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay |
+ /* TODO: slaveOk maybe shouldn't use? */
+ QueryOption_AwaitData
+ ).release() );
+ }
+
+ void tailingQueryGTE(const char *ns, OpTime t, const BSONObj* fields=0) {
+ BSONObjBuilder q;
+ q.appendDate("$gte", t.asDate());
+ BSONObjBuilder query;
+ query.append("ts", q.done());
+ tailingQuery(ns, query.done(), fields);
+ }
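+        /* the query sent above has the shape (illustrative):
+             { ts: { $gte: <t> } }
+           issued with the tailable, slaveOk, oplogReplay and awaitData options. */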
+
+ /* Do a tailing query, but only send the ts field back. */
+ void ghostQueryGTE(const char *ns, OpTime t) {
+ const BSONObj fields = BSON("ts" << 1 << "_id" << 0);
+ return tailingQueryGTE(ns, t, &fields);
+ }
+
+ bool more() {
+ uassert( 15910, "Doesn't have cursor for reading oplog", cursor.get() );
+ return cursor->more();
+ }
+
+ bool moreInCurrentBatch() {
+ uassert( 15911, "Doesn't have cursor for reading oplog", cursor.get() );
+ return cursor->moreInCurrentBatch();
+ }
+
+ /* old mongod's can't do the await flag... */
+ bool awaitCapable() {
+ return cursor->hasResultFlag(ResultFlag_AwaitCapable);
+ }
+
+ void peek(vector<BSONObj>& v, int n) {
+ if( cursor.get() )
+ cursor->peek(v,n);
+ }
+ BSONObj nextSafe() { return cursor->nextSafe(); }
+ BSONObj next() { return cursor->next(); }
+ void putBack(BSONObj op) { cursor->putBack(op); }
+
+ private:
+ bool commonConnect(const string& hostName);
+ bool passthroughHandshake(const BSONObj& rid, const int f);
+ };
+
+}
diff --git a/src/mongo/db/ops/count.cpp b/src/mongo/db/ops/count.cpp
new file mode 100644
index 00000000000..3c183596b9d
--- /dev/null
+++ b/src/mongo/db/ops/count.cpp
@@ -0,0 +1,103 @@
+// count.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "count.h"
+
+#include "../client.h"
+#include "../clientcursor.h"
+#include "../namespace.h"
+#include "../queryutil.h"
+
+namespace mongo {
+
+ long long runCount( const char *ns, const BSONObj &cmd, string &err ) {
+ Client::Context cx(ns);
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d ) {
+ err = "ns missing";
+ return -1;
+ }
+ BSONObj query = cmd.getObjectField("query");
+
+ // count of all objects
+ if ( query.isEmpty() ) {
+ return applySkipLimit( d->stats.nrecords , cmd );
+ }
+
+ string exceptionInfo;
+ long long count = 0;
+ long long skip = cmd["skip"].numberLong();
+ long long limit = cmd["limit"].numberLong();
+ bool simpleEqualityMatch;
+ shared_ptr<Cursor> cursor = NamespaceDetailsTransient::getCursor( ns, query, BSONObj(), false, &simpleEqualityMatch );
+ ClientCursor::CleanupPointer ccPointer;
+ ElapsedTracker timeToStartYielding( 256, 20 );
+ try {
+ while( cursor->ok() ) {
+ if ( !ccPointer ) {
+ if ( timeToStartYielding.intervalHasElapsed() ) {
+ // Lazily construct a ClientCursor, avoiding a performance regression when scanning a very
+ // small number of documents.
+ ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) );
+ }
+ }
+ else if ( !ccPointer->yieldSometimes( simpleEqualityMatch ? ClientCursor::DontNeed : ClientCursor::MaybeCovered ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
+ // With simple equality matching there is no need to use the matcher because the bounds
+ // are enforced by the FieldRangeVectorIterator and only key fields have constraints. There
+ // is no need to do key deduping because an exact value is specified in the query for all key
+ // fields and duplicate keys are not allowed per document.
+ // NOTE In the distant past we used a min/max bounded BtreeCursor with a shallow
+ // equality comparison to check for matches in the simple match case. That may be
+ // more performant, but I don't think we've measured the performance.
+ if ( simpleEqualityMatch ||
+ ( cursor->currentMatches() && !cursor->getsetdup( cursor->currLoc() ) ) ) {
+
+ if ( skip > 0 ) {
+ --skip;
+ }
+ else {
+ ++count;
+ if ( limit > 0 && count >= limit ) {
+ break;
+ }
+ }
+ }
+ cursor->advance();
+ }
+ ccPointer.reset();
+ return count;
+
+ } catch ( const DBException &e ) {
+ exceptionInfo = e.toString();
+ } catch ( const std::exception &e ) {
+ exceptionInfo = e.what();
+ } catch ( ... ) {
+ exceptionInfo = "unknown exception";
+ }
+ // Historically we have returned zero in many count assertion cases - see SERVER-2291.
+ log() << "Count with ns: " << ns << " and query: " << query
+ << " failed with exception: " << exceptionInfo
+ << endl;
+ return 0;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/ops/count.h b/src/mongo/db/ops/count.h
new file mode 100644
index 00000000000..807741e1253
--- /dev/null
+++ b/src/mongo/db/ops/count.h
@@ -0,0 +1,30 @@
+// count.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../jsobj.h"
+#include "../diskloc.h"
+
+namespace mongo {
+
+ /**
+ * { count: "collectionname"[, query: <query>] }
+     * @return -1 if the ns does not exist; 0 if the count fails with an exception (see SERVER-2291); otherwise the match count.
+ */
+ long long runCount(const char *ns, const BSONObj& cmd, string& err);
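+
+    /* For illustration (collection name and values hypothetical), a typical cmd object
+       looks like:
+           { count : "foo" , query : { x : 1 } , skip : 5 , limit : 100 }
+       evaluated against an ns such as "test.foo". */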
+
+} // namespace mongo
diff --git a/src/mongo/db/ops/delete.cpp b/src/mongo/db/ops/delete.cpp
new file mode 100644
index 00000000000..e33611c151e
--- /dev/null
+++ b/src/mongo/db/ops/delete.cpp
@@ -0,0 +1,158 @@
+// delete.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "delete.h"
+#include "../queryoptimizer.h"
+#include "../oplog.h"
+
+namespace mongo {
+
+ /* ns: namespace, e.g. <database>.<collection>
+ pattern: the "where" clause / criteria
+ justOne: stop after 1 match
+ god: allow access to system namespaces, and don't yield
+ */
+ long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
+ if( !god ) {
+ if ( strstr(ns, ".system.") ) {
+ /* note a delete from system.indexes would corrupt the db
+ if done here, as there are pointers into those objects in
+ NamespaceDetails.
+ */
+ uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
+ }
+ if ( strchr( ns , '$' ) ) {
+ log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
+ uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
+ }
+ }
+
+ {
+ NamespaceDetails *d = nsdetails( ns );
+ if ( ! d )
+ return 0;
+ uassert( 10101 , "can't remove from a capped collection" , ! d->capped );
+ }
+
+ long long nDeleted = 0;
+
+ shared_ptr< Cursor > creal = NamespaceDetailsTransient::getCursor( ns, pattern, BSONObj(), false, 0 );
+
+ if( !creal->ok() )
+ return nDeleted;
+
+ shared_ptr< Cursor > cPtr = creal;
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
+ cc->setDoingDeletes( true );
+
+ CursorId id = cc->cursorid();
+
+ bool justOne = justOneOrig;
+ bool canYield = !god && !(creal->matcher() && creal->matcher()->docMatcher().atomic());
+
+ do {
+ // TODO: we can generalize this I believe
+ //
+ bool willNeedRecord = (creal->matcher() && creal->matcher()->needRecord()) || pattern.isEmpty() || isSimpleIdQuery( pattern );
+ if ( ! willNeedRecord ) {
+ // TODO: this is a total hack right now
+ // check if the index full encompasses query
+
+ if ( pattern.nFields() == 1 &&
+ str::equals( pattern.firstElement().fieldName() , creal->indexKeyPattern().firstElement().fieldName() ) )
+ willNeedRecord = true;
+ }
+
+ if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) {
+ cc.release(); // has already been deleted elsewhere
+ // TODO should we assert or something?
+ break;
+ }
+ if ( !cc->ok() ) {
+ break; // if we yielded, could have hit the end
+ }
+
+ // this way we can avoid calling updateLocation() every time (expensive)
+ // as well as some other nuances handled
+ cc->setDoingDeletes( true );
+
+ DiskLoc rloc = cc->currLoc();
+ BSONObj key = cc->currKey();
+
+ bool match = creal->currentMatches();
+ bool dup = cc->c()->getsetdup(rloc);
+
+ if ( ! cc->advance() )
+ justOne = true;
+
+ if ( ! match )
+ continue;
+
+ assert( !dup ); // can't be a dup, we deleted it!
+
+ if ( !justOne ) {
+                /* NOTE: this is SLOW. prepareToTouchEarlierIterate() was designed to be
+                   called across getMore blocks; here we might call it millions of times,
+                   which would be bad.
+                   */
+ cc->c()->prepareToTouchEarlierIterate();
+ }
+
+ if ( logop ) {
+ BSONElement e;
+ if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
+ BSONObjBuilder b;
+ b.append( e );
+ bool replJustOne = true;
+ logOp( "d", ns, b.done(), 0, &replJustOne );
+ }
+ else {
+ problem() << "deleted object without id, not logging" << endl;
+ }
+ }
+
+ if ( rs )
+ rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );
+
+ theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
+ nDeleted++;
+ if ( justOne ) {
+ break;
+ }
+ cc->c()->recoverFromTouchingEarlierIterate();
+
+ if( !god )
+ getDur().commitIfNeeded();
+
+ if( debug && god && nDeleted == 100 )
+ log() << "warning high number of deletes with god=true which could use significant memory" << endl;
+ }
+ while ( cc->ok() );
+
+ if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
+ // TODO: remove this and the id declaration above if this doesn't trigger
+            // if it does, then I'm very confused (ERH 06/2011)
+ error() << "this should be impossible" << endl;
+ printStackTrace();
+ cc.release();
+ }
+
+ return nDeleted;
+ }
+
+}
diff --git a/src/mongo/db/ops/delete.h b/src/mongo/db/ops/delete.h
new file mode 100644
index 00000000000..a74b7a664bc
--- /dev/null
+++ b/src/mongo/db/ops/delete.h
@@ -0,0 +1,33 @@
+// delete.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+ class RemoveSaver;
+
+    // Deletes documents from ns matching pattern; if justOne is true, at most one document is deleted.
+ long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false, RemoveSaver * rs=0);
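+
+    // Illustrative call (namespace and criteria hypothetical): remove at most one
+    // matching document from test.foo and log the delete to the oplog:
+    //     deleteObjects( "test.foo", BSON( "x" << 1 ), /*justOne=*/true, /*logop=*/true );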
+
+
+}
diff --git a/src/mongo/db/ops/query.cpp b/src/mongo/db/ops/query.cpp
new file mode 100644
index 00000000000..15e3ed9053f
--- /dev/null
+++ b/src/mongo/db/ops/query.cpp
@@ -0,0 +1,870 @@
+// query.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "query.h"
+#include "../pdfile.h"
+#include "../jsobjmanipulator.h"
+#include "../../bson/util/builder.h"
+#include <time.h>
+#include "../introspect.h"
+#include "../btree.h"
+#include "../../util/lruishmap.h"
+#include "../json.h"
+#include "../repl.h"
+#include "../replutil.h"
+#include "../scanandorder.h"
+#include "../security.h"
+#include "../curop-inl.h"
+#include "../commands.h"
+#include "../queryoptimizer.h"
+#include "../lasterror.h"
+#include "../../s/d_logic.h"
+#include "../repl_block.h"
+#include "../../server.h"
+#include "../d_concurrency.h"
+
+namespace mongo {
+
+ /* We cut off further objects once we cross this threshold; thus, you might get
+ a little bit more than this, it is a threshold rather than a limit.
+ */
+ const int MaxBytesToReturnToClientAtOnce = 4 * 1024 * 1024;
+
+ //ns->query->DiskLoc
+// LRUishMap<BSONObj,DiskLoc,5> lrutest(123);
+
+ extern bool useCursors;
+ extern bool useHints;
+
+ bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
+ try {
+ return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions);
+ }
+ catch( SendStaleConfigException& ){
+ throw;
+ }
+ catch ( AssertionException& e ) {
+ assert( e.getCode() != SendStaleConfigCode && e.getCode() != RecvStaleConfigCode );
+
+ e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" );
+ curop.debug().exceptionInfo = e.getInfo();
+ }
+ anObjBuilder.append("errmsg", "db assertion failure");
+ anObjBuilder.append("ok", 0.0);
+ BSONObj x = anObjBuilder.done();
+ b.appendBuf((void*) x.objdata(), x.objsize());
+ return true;
+ }
+
+
+ BSONObj id_obj = fromjson("{\"_id\":1}");
+ BSONObj empty_obj = fromjson("{}");
+
+
+ //int dump = 0;
+
+ /* empty result for error conditions */
+ QueryResult* emptyMoreResult(long long cursorid) {
+ BufBuilder b(32768);
+ b.skip(sizeof(QueryResult));
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->cursorId = 0; // 0 indicates no more data to retrieve.
+ qr->startingFrom = 0;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->initializeResultFlags();
+ qr->nReturned = 0;
+ b.decouple();
+ return qr;
+ }
+
+ QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
+ exhaust = false;
+ ClientCursor::Pointer p(cursorid);
+ ClientCursor *cc = p.c();
+
+ int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;
+
+ BufBuilder b( bufSize );
+ b.skip(sizeof(QueryResult));
+ int resultFlags = ResultFlag_AwaitCapable;
+ int start = 0;
+ int n = 0;
+
+ if ( unlikely(!cc) ) {
+ LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
+ cursorid = 0;
+ resultFlags = ResultFlag_CursorNotFound;
+ }
+ else {
+        // check that the ns has not been spoofed and still matches the one the cursor was created with
+ uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));
+
+ if ( pass == 0 )
+ cc->updateSlaveLocation( curop );
+
+ int queryOptions = cc->queryOptions();
+
+ curop.debug().query = cc->query();
+
+ start = cc->pos();
+ Cursor *c = cc->c();
+ c->checkLocation();
+ DiskLoc last;
+
+ scoped_ptr<Projection::KeyOnly> keyFieldsOnly;
+ if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
+ keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );
+
+ // This manager may be stale, but it's the state of chunking when the cursor was created.
+ ShardChunkManagerPtr manager = cc->getChunkManager();
+
+ while ( 1 ) {
+ if ( !c->ok() ) {
+ if ( c->tailable() ) {
+                    /* when a tailable cursor hits "EOF", ok() goes false and current() is null. however,
+                       advance() can still be retried as a reactivation attempt; when there is new data, it
+                       will return true. that's what we are doing here.
+                       */
+ if ( c->advance() )
+ continue;
+
+ if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
+ return 0;
+ }
+
+ break;
+ }
+ p.release();
+ bool ok = ClientCursor::erase(cursorid);
+ assert(ok);
+ cursorid = 0;
+ cc = 0;
+ break;
+ }
+
+ // in some cases (clone collection) there won't be a matcher
+ if ( c->matcher() && !c->matcher()->matchesCurrent( c ) ) {
+ }
+ else if ( manager && ! manager->belongsToMe( cc ) ){
+ LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
+ }
+ else {
+ if( c->getsetdup(c->currLoc()) ) {
+ //out() << " but it's a dup \n";
+ }
+ else {
+ last = c->currLoc();
+ n++;
+
+ if ( keyFieldsOnly ) {
+ fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) );
+ }
+ else {
+ BSONObj js = c->current();
+ // show disk loc should be part of the main query, not in an $or clause, so this should be ok
+ fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
+ }
+
+ if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
+ c->advance();
+ cc->incPos( n );
+ break;
+ }
+ }
+ }
+ c->advance();
+
+ if ( ! cc->yieldSometimes( ClientCursor::MaybeCovered ) ) {
+ ClientCursor::erase(cursorid);
+ cursorid = 0;
+ cc = 0;
+ p.deleted();
+ break;
+ }
+ }
+
+ if ( cc ) {
+ cc->updateLocation();
+ cc->mayUpgradeStorage();
+ cc->storeOpForSlave( last );
+ exhaust = cc->queryOptions() & QueryOption_Exhaust;
+ }
+ }
+
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->_resultFlags() = resultFlags;
+ qr->cursorId = cursorid;
+ qr->startingFrom = start;
+ qr->nReturned = n;
+ b.decouple();
+
+ return qr;
+ }
+
+ class ExplainBuilder {
+ // Note: by default we filter out allPlans and oldPlan in the shell's
+ // explain() function. If you add any recursive structures, make sure to
+ // edit the JS to make sure everything gets filtered.
+ public:
+ ExplainBuilder() : _i() {}
+ void ensureStartScan() {
+ if ( !_a.get() ) {
+ _a.reset( new BSONArrayBuilder() );
+ }
+ }
+ void noteCursor( Cursor *c ) {
+ BSONObjBuilder b( _a->subobjStart() );
+ b << "cursor" << c->toString() << "indexBounds" << c->prettyIndexBounds();
+ b.done();
+ }
+ void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder,
+ int millis, bool hint, int nYields , int nChunkSkips , bool indexOnly ) {
+ if ( _i == 1 ) {
+ _c.reset( new BSONArrayBuilder() );
+ *_c << _b->obj();
+ }
+ if ( _i == 0 ) {
+ _b.reset( new BSONObjBuilder() );
+ }
+ else {
+ _b.reset( new BSONObjBuilder( _c->subobjStart() ) );
+ }
+ *_b << "cursor" << c->toString();
+ _b->appendNumber( "nscanned", nscanned );
+ _b->appendNumber( "nscannedObjects", nscannedObjects );
+ *_b << "n" << n;
+
+ if ( scanAndOrder )
+ *_b << "scanAndOrder" << true;
+
+ *_b << "millis" << millis;
+
+ *_b << "nYields" << nYields;
+ *_b << "nChunkSkips" << nChunkSkips;
+ *_b << "isMultiKey" << c->isMultiKey();
+ *_b << "indexOnly" << indexOnly;
+
+ *_b << "indexBounds" << c->prettyIndexBounds();
+
+ c->explainDetails( *_b );
+
+ if ( !hint ) {
+ *_b << "allPlans" << _a->arr();
+ }
+ if ( _i != 0 ) {
+ _b->done();
+ }
+ _a.reset( 0 );
+ ++_i;
+ }
+ BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) {
+ if ( _i > 1 ) {
+ BSONObjBuilder b;
+ b << "clauses" << _c->arr();
+ b.appendNumber( "nscanned", nscanned );
+ b.appendNumber( "nscannedObjects", nscannedObjects );
+ b << "n" << n;
+ b << "millis" << millis;
+ b.appendElements( suffix );
+ return b.obj();
+ }
+ else {
+ stringstream host;
+ host << getHostNameCached() << ":" << cmdLine.port;
+ *_b << "server" << host.str();
+ _b->appendElements( suffix );
+ return _b->obj();
+ }
+ }
+ private:
+ auto_ptr< BSONArrayBuilder > _a;
+ auto_ptr< BSONObjBuilder > _b;
+ auto_ptr< BSONArrayBuilder > _c;
+ int _i;
+ };
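+
+    /* For illustration only (field values hypothetical), a single-plan explain assembled
+       by this builder has roughly the shape:
+           { cursor: "BtreeCursor x_1", nscanned: 50, nscannedObjects: 50, n: 10,
+             millis: 3, nYields: 0, nChunkSkips: 0, isMultiKey: false, indexOnly: false,
+             indexBounds: {...}, allPlans: [...], server: "host:27017" }
+       while a multi-clause ($or) explain wraps the per-clause objects in a "clauses"
+       array with aggregate nscanned/nscannedObjects/n/millis appended. */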
+
+ // Implements database 'query' requests using the query optimizer's QueryOp interface
+ class UserQueryOp : public QueryOp {
+ public:
+
+ UserQueryOp( const ParsedQuery& pq, Message &response, ExplainBuilder &eb, CurOp &curop ) :
+ _buf( 32768 ) , // TODO be smarter here
+ _pq( pq ) ,
+ _ntoskip( pq.getSkip() ) ,
+ _nscanned(0), _oldNscanned(0), _nscannedObjects(0), _oldNscannedObjects(0),
+ _n(0),
+ _oldN(0),
+ _nYields(),
+ _nChunkSkips(),
+ _chunkManager( shardingState.needShardChunkManager(pq.ns()) ?
+ shardingState.getShardChunkManager(pq.ns()) : ShardChunkManagerPtr() ),
+ _inMemSort(false),
+ _capped(false),
+ _saveClientCursor(false),
+ _wouldSaveClientCursor(false),
+ _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ),
+ _response( response ),
+ _eb( eb ),
+ _curop( curop ),
+ _yieldRecoveryFailed()
+ {}
+
+ virtual void _init() {
+ // only need to put the QueryResult fields there if we're building the first buffer in the message.
+ if ( _response.empty() ) {
+ _buf.skip( sizeof( QueryResult ) );
+ }
+
+ if ( _oplogReplay ) {
+ _findingStartCursor.reset( new FindingStartCursor( qp() ) );
+ _capped = true;
+ }
+ else {
+ _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() );
+ _capped = _c->capped();
+
+ // setup check for if we can only use index to extract
+ if ( _c->modifiedKeys() == false && _c->isMultiKey() == false && _pq.getFields() ) {
+ _keyFieldsOnly.reset( _pq.getFields()->checkKey( _c->indexKeyPattern() ) );
+ }
+ }
+
+ if ( qp().scanAndOrderRequired() ) {
+ _inMemSort = true;
+ _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder(), qp().multikeyFrs() ) );
+ }
+
+ if ( _pq.isExplain() ) {
+ _eb.noteCursor( _c.get() );
+ }
+
+ }
+
+ virtual bool prepareToYield() {
+ if ( _findingStartCursor.get() ) {
+ return _findingStartCursor->prepareToYield();
+ }
+ else {
+ if ( _c && !_cc ) {
+ _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _pq.ns() ) );
+ }
+ if ( _cc ) {
+ return _cc->prepareToYield( _yieldData );
+ }
+ }
+ // no active cursor - ok to yield
+ return true;
+ }
+
+ virtual void recoverFromYield() {
+ _nYields++;
+
+ if ( _findingStartCursor.get() ) {
+ _findingStartCursor->recoverFromYield();
+ }
+ else if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _yieldRecoveryFailed = true;
+ _c.reset();
+ _cc.reset();
+ _so.reset();
+
+ if ( _capped ) {
+ msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() );
+ }
+ else if ( qp().mustAssertOnYieldFailure() ) {
+ msgassertedNoTrace( 15890, str::stream() << "UserQueryOp::recoverFromYield() failed to recover: " << _pq.ns() );
+ }
+ else {
+ // we don't fail query since we're fine with returning partial data if collection dropped
+
+ // todo: this is wrong. the cursor could be gone if closeAllDatabases command just ran
+ }
+
+ }
+ }
+
+ virtual long long nscanned() {
+ if ( _findingStartCursor.get() ) {
+ return 0; // should only be one query plan, so value doesn't really matter.
+ }
+ return _c.get() ? _c->nscanned() : _nscanned;
+ }
+
+ virtual void next() {
+ if ( _findingStartCursor.get() ) {
+ if ( !_findingStartCursor->done() ) {
+ _findingStartCursor->next();
+ }
+ if ( _findingStartCursor->done() ) {
+ _c = _findingStartCursor->cursor();
+ _findingStartCursor.reset( 0 );
+ }
+ _capped = true;
+ return;
+ }
+
+ if ( !_c || !_c->ok() ) {
+ finish( false );
+ return;
+ }
+
+ bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors;
+
+ if( 0 ) {
+ cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl;
+ }
+
+ if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ) {
+ finish( true ); //?
+ return;
+ }
+
+ _nscanned = _c->nscanned();
+ if ( !matcher( _c )->matchesCurrent(_c.get() , &_details ) ) {
+ // not a match, continue onward
+ if ( _details._loadedObject )
+ _nscannedObjects++;
+ }
+ else {
+ _nscannedObjects++;
+ DiskLoc cl = _c->currLoc();
+ if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) { // TODO: should make this covered at some point
+ _nChunkSkips++;
+ // log() << "TEMP skipping un-owned chunk: " << _c->current() << endl;
+ }
+ else if( _c->getsetdup(cl) ) {
+ // dup
+ }
+ else {
+ // got a match.
+
+ if ( _inMemSort ) {
+ // note: no cursors for non-indexed, ordered results. results must be fairly small.
+ _so->add( _pq.returnKey() ? _c->currKey() : _c->current(), _pq.showDiskLoc() ? &cl : 0 );
+ }
+ else if ( _ntoskip > 0 ) {
+ _ntoskip--;
+ }
+ else {
+ if ( _pq.isExplain() ) {
+ _n++;
+ if ( n() >= _pq.getNumToReturn() && !_pq.wantMore() ) {
+ // .limit() was used, show just that much.
+ finish( true ); //?
+ return;
+ }
+ }
+ else {
+
+ if ( _pq.returnKey() ) {
+ BSONObjBuilder bb( _buf );
+ bb.appendKeys( _c->indexKeyPattern() , _c->currKey() );
+ bb.done();
+ }
+ else if ( _keyFieldsOnly ) {
+ fillQueryResultFromObj( _buf , 0 , _keyFieldsOnly->hydrate( _c->currKey() ) );
+ }
+ else {
+ BSONObj js = _c->current();
+ assert( js.isValid() );
+
+ if ( _oplogReplay ) {
+ BSONElement e = js["ts"];
+ if ( e.type() == Date || e.type() == Timestamp )
+ _slaveReadTill = e._opTime();
+ }
+
+ fillQueryResultFromObj( _buf , _pq.getFields() , js , (_pq.showDiskLoc() ? &cl : 0));
+ }
+ _n++;
+ if ( ! _c->supportGetMore() ) {
+ if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ) {
+ finish( true );
+ return;
+ }
+ }
+ else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ) {
+ /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */
+ if ( mayCreateCursor1 ) {
+ _wouldSaveClientCursor = true;
+ if ( _c->advance() ) {
+ // more...so save a cursor
+ _saveClientCursor = true;
+ }
+ }
+ finish( true );
+ return;
+ }
+ }
+ }
+ }
+ }
+ _c->advance();
+ }
+
+ // this plan won, so set data for response broadly
+ void finish( bool stop ) {
+ massert( 13638, "client cursor dropped during explain query yield", !_pq.isExplain() || _c.get() );
+
+ if ( _pq.isExplain() ) {
+ _n = _inMemSort ? _so->size() : _n;
+ }
+ else if ( _inMemSort ) {
+ if( _so.get() )
+ _so->fill( _buf, _pq.getFields() , _n );
+ }
+
+ if ( _c.get() ) {
+ _nscanned = _c->nscanned();
+
+ if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 )
+ _c->setTailable();
+
+ // If the tailing request succeeded.
+ if ( _c->tailable() )
+ _saveClientCursor = true;
+ }
+
+ if ( _pq.isExplain() ) {
+ _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(),
+ _curop.elapsedMillis(), useHints && !_pq.getHint().eoo(), _nYields ,
+                            _nChunkSkips, _keyFieldsOnly.get() != 0 );
+ }
+ else {
+ if ( _buf.len() ) {
+ _response.appendData( _buf.buf(), _buf.len() );
+ _buf.decouple();
+ }
+ }
+
+ if ( stop ) {
+ setStop();
+ }
+ else {
+ setComplete();
+ }
+
+ }
+
+ void finishExplain( const BSONObj &suffix ) {
+ BSONObj obj = _eb.finishWithSuffix( totalNscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix);
+ fillQueryResultFromObj(_buf, 0, obj);
+ _n = 1;
+ _oldN = 0;
+ _response.appendData( _buf.buf(), _buf.len() );
+ _buf.decouple();
+ }
+
+ virtual bool mayRecordPlan() const {
+ return !_yieldRecoveryFailed && ( _pq.getNumToReturn() != 1 ) && ( ( _n > _pq.getNumToReturn() / 2 ) || ( complete() && !stopRequested() ) );
+ }
+
+ virtual QueryOp *_createChild() const {
+ if ( _pq.isExplain() ) {
+ _eb.ensureStartScan();
+ }
+ UserQueryOp *ret = new UserQueryOp( _pq, _response, _eb, _curop );
+ ret->_oldN = n();
+ ret->_oldNscanned = totalNscanned();
+ ret->_oldNscannedObjects = nscannedObjects();
+ ret->_ntoskip = _ntoskip;
+ return ret;
+ }
+
+ bool scanAndOrderRequired() const { return _inMemSort; }
+ shared_ptr<Cursor> cursor() { return _c; }
+ int n() const { return _oldN + _n; }
+ long long totalNscanned() const { return _nscanned + _oldNscanned; }
+ long long nscannedObjects() const { return _nscannedObjects + _oldNscannedObjects; }
+ bool saveClientCursor() const { return _saveClientCursor; }
+ bool wouldSaveClientCursor() const { return _wouldSaveClientCursor; }
+
+ void finishForOplogReplay( ClientCursor * cc ) {
+ if ( _oplogReplay && ! _slaveReadTill.isNull() )
+ cc->slaveReadTill( _slaveReadTill );
+
+ }
+
+ ShardChunkManagerPtr getChunkManager(){ return _chunkManager; }
+
+ private:
+ BufBuilder _buf;
+ const ParsedQuery& _pq;
+ scoped_ptr<Projection::KeyOnly> _keyFieldsOnly;
+
+ long long _ntoskip;
+ long long _nscanned;
+ long long _oldNscanned;
+ long long _nscannedObjects;
+ long long _oldNscannedObjects;
+ int _n; // found so far
+ int _oldN;
+
+ int _nYields;
+ int _nChunkSkips;
+
+ MatchDetails _details;
+
+ ShardChunkManagerPtr _chunkManager;
+
+ bool _inMemSort;
+ auto_ptr< ScanAndOrder > _so;
+
+ shared_ptr<Cursor> _c;
+ ClientCursor::CleanupPointer _cc;
+ ClientCursor::YieldData _yieldData;
+
+ bool _capped;
+ bool _saveClientCursor;
+ bool _wouldSaveClientCursor;
+ bool _oplogReplay;
+ auto_ptr< FindingStartCursor > _findingStartCursor;
+
+ Message &_response;
+ ExplainBuilder &_eb;
+ CurOp &_curop;
+ OpTime _slaveReadTill;
+
+ bool _yieldRecoveryFailed;
+ };
+
+    /* run a query -- includes checking for and running a Command
+       @return a pointer to the ns if in exhaust mode; 0 in normal mode
+    */
+ const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
+ shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
+ ParsedQuery& pq( *pq_shared );
+ int ntoskip = q.ntoskip;
+ BSONObj jsobj = q.query;
+ int queryOptions = q.queryOptions;
+ const char *ns = q.ns;
+
+ if( logLevel >= 2 )
+ log() << "runQuery called " << ns << " " << jsobj << endl;
+
+ curop.debug().ns = ns;
+ curop.debug().ntoreturn = pq.getNumToReturn();
+ curop.setQuery(jsobj);
+
+ if ( pq.couldBeCommand() ) {
+ BufBuilder bb;
+ bb.skip(sizeof(QueryResult));
+ BSONObjBuilder cmdResBuf;
+ if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
+ curop.debug().iscommand = true;
+ curop.debug().query = jsobj;
+ curop.markCommand();
+
+ auto_ptr< QueryResult > qr;
+ qr.reset( (QueryResult *) bb.buf() );
+ bb.decouple();
+ qr->setResultFlagsToOk();
+ qr->len = bb.len();
+ curop.debug().responseLength = bb.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ result.setData( qr.release(), true );
+ }
+ else {
+ uasserted(13530, "bad or malformed command request?");
+ }
+ return 0;
+ }
+
+ /* --- regular query --- */
+
+ int n = 0;
+ BSONElement hint = useHints ? pq.getHint() : BSONElement();
+ bool explain = pq.isExplain();
+ bool snapshot = pq.isSnapshot();
+ BSONObj order = pq.getOrder();
+ BSONObj query = pq.getFilter();
+
+        /* The ElemIter will not be happy if this isn't really an object, so throw an
+           exception here when that is the case.
+           (This may indicate bad data from the client.)
+        */
+ if ( query.objsize() == 0 ) {
+ out() << "Bad query object?\n jsobj:";
+ out() << jsobj.toString() << "\n query:";
+ out() << query.toString() << endl;
+ uassert( 10110 , "bad query object", false);
+ }
+
+ Client::ReadContext ctx( ns , dbpath ); // read locks
+
+ replVerifyReadsOk(pq);
+
+ if ( pq.hasOption( QueryOption_CursorTailable ) ) {
+ NamespaceDetails *d = nsdetails( ns );
+ uassert( 13051, "tailable cursor requested on non capped collection", d && d->capped );
+ const BSONObj nat1 = BSON( "$natural" << 1 );
+ if ( order.isEmpty() ) {
+ order = nat1;
+ }
+ else {
+ uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
+ }
+ }
+
+ BSONObj snapshotHint; // put here to keep the data in scope
+ if( snapshot ) {
+ NamespaceDetails *d = nsdetails(ns);
+ if ( d ) {
+ int i = d->findIdIndex();
+ if( i < 0 ) {
+ if ( strstr( ns , ".system." ) == 0 )
+ log() << "warning: no _id index on $snapshot query, ns:" << ns << endl;
+ }
+ else {
+ /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here.
+ probably need a better way to specify "use the _id index" as a hint. if someone is
+ in the query optimizer please fix this then!
+ */
+ BSONObjBuilder b;
+ b.append("$hint", d->idx(i).indexName());
+ snapshotHint = b.obj();
+ hint = snapshotHint.firstElement();
+ }
+ }
+ }
+
+ if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {
+
+ bool nsFound = false;
+ bool indexFound = false;
+
+ BSONObj resObject;
+ Client& c = cc();
+ bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
+ if ( nsFound == false || indexFound == true ) {
+ BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
+ bb.skip(sizeof(QueryResult));
+
+ curop.debug().idhack = true;
+ if ( found ) {
+ n = 1;
+ fillQueryResultFromObj( bb , pq.getFields() , resObject );
+ }
+ auto_ptr< QueryResult > qr;
+ qr.reset( (QueryResult *) bb.buf() );
+ bb.decouple();
+ qr->setResultFlagsToOk();
+ qr->len = bb.len();
+
+ curop.debug().responseLength = bb.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = n;
+ result.setData( qr.release(), true );
+ return NULL;
+ }
+ }
+
+ // regular, not QO bypass query
+
+ BSONObj oldPlan;
+ if ( explain && ! pq.hasIndexSpecifier() ) {
+ MultiPlanScanner mps( ns, query, order );
+ if ( mps.usingCachedPlan() )
+ oldPlan = mps.oldExplain();
+ }
+ auto_ptr< MultiPlanScanner > mps( new MultiPlanScanner( ns, query, order, &hint, !explain, pq.getMin(), pq.getMax(), false, true ) );
+ BSONObj explainSuffix;
+ if ( explain ) {
+ BSONObjBuilder bb;
+ if ( !oldPlan.isEmpty() )
+ bb.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() );
+ explainSuffix = bb.obj();
+ }
+ ExplainBuilder eb;
+ UserQueryOp original( pq, result, eb, curop );
+ shared_ptr< UserQueryOp > o = mps->runOp( original );
+ UserQueryOp &dqo = *o;
+ if ( ! dqo.complete() )
+ throw MsgAssertionException( dqo.exception() );
+ if ( explain ) {
+ dqo.finishExplain( explainSuffix );
+ }
+ n = dqo.n();
+ long long nscanned = dqo.totalNscanned();
+ curop.debug().scanAndOrder = dqo.scanAndOrderRequired();
+
+ shared_ptr<Cursor> cursor = dqo.cursor();
+ if( logLevel >= 5 )
+ log() << " used cursor: " << cursor.get() << endl;
+ long long cursorid = 0;
+ const char * exhaust = 0;
+ if ( dqo.saveClientCursor() || ( dqo.wouldSaveClientCursor() && mps->mayRunMore() ) ) {
+ ClientCursor *cc;
+ bool moreClauses = mps->mayRunMore();
+ if ( moreClauses ) {
+ // this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield
+ shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher( cursor ), dqo ) );
+ cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned());
+ }
+ else {
+ if( ! cursor->matcher() ) cursor->setMatcher( dqo.matcher( cursor ) );
+ cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() );
+ }
+
+ cc->setChunkManager( dqo.getChunkManager() );
+
+ cursorid = cc->cursorid();
+ DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
+ cc->setPos( n );
+ cc->pq = pq_shared;
+ cc->fields = pq.getFieldPtr();
+ cc->originalMessage = m;
+ cc->updateLocation();
+ if ( !cc->ok() && cc->c()->tailable() )
+ DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
+ if( queryOptions & QueryOption_Exhaust ) {
+ exhaust = ns;
+ curop.debug().exhaust = true;
+ }
+ dqo.finishForOplogReplay(cc);
+ }
+
+ QueryResult *qr = (QueryResult *) result.header();
+ qr->cursorId = cursorid;
+ qr->setResultFlagsToOk();
+ // qr->len is updated automatically by appendData()
+ curop.debug().responseLength = qr->len;
+ qr->setOperation(opReply);
+ qr->startingFrom = 0;
+ qr->nReturned = n;
+
+ int duration = curop.elapsedMillis();
+ bool dbprofile = curop.shouldDBProfile( duration );
+ if ( dbprofile || duration >= cmdLine.slowMS ) {
+ curop.debug().nscanned = (int) nscanned;
+ curop.debug().ntoskip = ntoskip;
+ }
+ curop.debug().nreturned = n;
+ return exhaust;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/ops/query.h b/src/mongo/db/ops/query.h
new file mode 100644
index 00000000000..3324b75fe16
--- /dev/null
+++ b/src/mongo/db/ops/query.h
@@ -0,0 +1,248 @@
+// query.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../pch.h"
+#include "../../util/net/message.h"
+#include "../dbmessage.h"
+#include "../jsobj.h"
+#include "../diskloc.h"
+#include "../projection.h"
+
+// struct QueryOptions, QueryResult, QueryResultFlags in:
+#include "../../client/dbclient.h"
+
+namespace mongo {
+
+ extern const int MaxBytesToReturnToClientAtOnce;
+
+ QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op, int pass, bool& exhaust);
+
+ const char * runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result);
+
+ /* This is for languages whose "objects" are not well ordered (JSON is well ordered).
+ [ { a : ... } , { b : ... } ] -> { a : ..., b : ... }
+ */
+ inline BSONObj transformOrderFromArrayFormat(BSONObj order) {
+ /* note: this is slow, but that is ok as order will have very few pieces */
+ BSONObjBuilder b;
+ char p[2] = "0";
+
+ while ( 1 ) {
+ BSONObj j = order.getObjectField(p);
+ if ( j.isEmpty() )
+ break;
+ BSONElement e = j.firstElement();
+ uassert( 10102 , "bad order array", !e.eoo());
+ uassert( 10103 , "bad order array [2]", e.isNumber());
+ b.append(e);
+ (*p)++;
+ uassert( 10104 , "too many ordering elements", *p <= '9');
+ }
+
+ return b.obj();
+ }
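+
+    /* Example (values illustrative): a driver sending the order as the array
+       [ { a : 1 } , { b : -1 } ] delivers it as { "0" : { a : 1 } , "1" : { b : -1 } },
+       which this function flattens to { a : 1 , b : -1 }. */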
+
+ /**
+ * this represents a total user query
+ * includes fields from the query message, both possible query levels
+ * parses everything up front
+ */
+ class ParsedQuery : boost::noncopyable {
+ public:
+ ParsedQuery( QueryMessage& qm )
+ : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ) {
+ init( qm.query );
+ initFields( qm.fields );
+ }
+ ParsedQuery( const char* ns , int ntoskip , int ntoreturn , int queryoptions , const BSONObj& query , const BSONObj& fields )
+ : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ) {
+ init( query );
+ initFields( fields );
+ }
+
+ const char * ns() const { return _ns; }
+ bool isLocalDB() const { return strncmp(_ns, "local.", 6) == 0; }
+
+ const BSONObj& getFilter() const { return _filter; }
+ Projection* getFields() const { return _fields.get(); }
+ shared_ptr<Projection> getFieldPtr() const { return _fields; }
+
+ int getSkip() const { return _ntoskip; }
+ int getNumToReturn() const { return _ntoreturn; }
+ bool wantMore() const { return _wantMore; }
+ int getOptions() const { return _options; }
+ bool hasOption( int x ) const { return x & _options; }
+
+ bool isExplain() const { return _explain; }
+ bool isSnapshot() const { return _snapshot; }
+ bool returnKey() const { return _returnKey; }
+ bool showDiskLoc() const { return _showDiskLoc; }
+
+ const BSONObj& getMin() const { return _min; }
+ const BSONObj& getMax() const { return _max; }
+ const BSONObj& getOrder() const { return _order; }
+ const BSONElement& getHint() const { return _hint; }
+ int getMaxScan() const { return _maxScan; }
+
+ bool couldBeCommand() const {
+ /* we assume you are using findOne() for running a cmd... */
+ return _ntoreturn == 1 && strstr( _ns , ".$cmd" );
+ }
+
+ bool hasIndexSpecifier() const {
+ return ! _hint.eoo() || ! _min.isEmpty() || ! _max.isEmpty();
+ }
+
+        /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there
+           is only a size limit. The idea is that on a find() where one doesn't use many results,
+           we don't return much, but once getmore kicks in, we start pushing significant quantities.
+
+           The n limit (vs. size) is important when someone fetches only one small field from big
+           objects, which would otherwise require massive scanning server-side to fill the size limit.
+        */
+ bool enoughForFirstBatch( int n , int len ) const {
+ if ( _ntoreturn == 0 )
+ return ( len > 1024 * 1024 ) || n >= 101;
+ return n >= _ntoreturn || len > MaxBytesToReturnToClientAtOnce;
+ }
+
+ bool enough( int n ) const {
+ if ( _ntoreturn == 0 )
+ return false;
+ return n >= _ntoreturn;
+ }
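+
+        /* Worked example of the rules above: with ntoreturn == 0 the first batch stops
+           after 101 objects or once it exceeds 1MB, and enough() never stops the query;
+           with ntoreturn == 50, enoughForFirstBatch() and enough() both cut off at
+           n >= 50 (or, for the first batch, at MaxBytesToReturnToClientAtOnce). */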
+
+ private:
+ void init( const BSONObj& q ) {
+ _reset();
+ uassert( 10105 , "bad skip value in query", _ntoskip >= 0);
+
+ if ( _ntoreturn < 0 ) {
+ /* _ntoreturn greater than zero is simply a hint on how many objects to send back per
+ "cursor batch".
+ A negative number indicates a hard limit.
+ */
+ _wantMore = false;
+ _ntoreturn = -_ntoreturn;
+ }
+
+
+ BSONElement e = q["query"];
+ if ( ! e.isABSONObj() )
+ e = q["$query"];
+
+ if ( e.isABSONObj() ) {
+ _filter = e.embeddedObject();
+ _initTop( q );
+ }
+ else {
+ _filter = q;
+ }
+ }
+
+ void _reset() {
+ _wantMore = true;
+ _explain = false;
+ _snapshot = false;
+ _returnKey = false;
+ _showDiskLoc = false;
+ _maxScan = 0;
+ }
+
+ void _initTop( const BSONObj& top ) {
+ BSONObjIterator i( top );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ const char * name = e.fieldName();
+
+ if ( strcmp( "$orderby" , name ) == 0 ||
+ strcmp( "orderby" , name ) == 0 ) {
+ if ( e.type() == Object ) {
+ _order = e.embeddedObject();
+ }
+ else if ( e.type() == Array ) {
+                    _order = transformOrderFromArrayFormat( e.embeddedObject() );
+ }
+ else {
+ uasserted(13513, "sort must be an object or array");
+ }
+ continue;
+ }
+
+ if( *name == '$' ) {
+ name++;
+ if ( strcmp( "explain" , name ) == 0 )
+ _explain = e.trueValue();
+ else if ( strcmp( "snapshot" , name ) == 0 )
+ _snapshot = e.trueValue();
+ else if ( strcmp( "min" , name ) == 0 )
+ _min = e.embeddedObject();
+ else if ( strcmp( "max" , name ) == 0 )
+ _max = e.embeddedObject();
+ else if ( strcmp( "hint" , name ) == 0 )
+ _hint = e;
+ else if ( strcmp( "returnKey" , name ) == 0 )
+ _returnKey = e.trueValue();
+ else if ( strcmp( "maxScan" , name ) == 0 )
+ _maxScan = e.numberInt();
+ else if ( strcmp( "showDiskLoc" , name ) == 0 )
+ _showDiskLoc = e.trueValue();
+ else if ( strcmp( "comment" , name ) == 0 ) {
+ ; // no-op
+ }
+ }
+ }
+
+ if ( _snapshot ) {
+ uassert( 12001 , "E12001 can't sort with $snapshot", _order.isEmpty() );
+ uassert( 12002 , "E12002 can't use hint with $snapshot", _hint.eoo() );
+ }
+
+ }
+
+ void initFields( const BSONObj& fields ) {
+ if ( fields.isEmpty() )
+ return;
+ _fields.reset( new Projection() );
+ _fields->init( fields );
+ }
+
+ const char * const _ns;
+ const int _ntoskip;
+ int _ntoreturn;
+ BSONObj _filter;
+ BSONObj _order;
+ const int _options;
+ shared_ptr< Projection > _fields;
+ bool _wantMore;
+ bool _explain;
+ bool _snapshot;
+ bool _returnKey;
+ bool _showDiskLoc;
+ BSONObj _min;
+ BSONObj _max;
+ BSONElement _hint;
+ int _maxScan;
+ };
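+
+    /* For illustration (values hypothetical), a wrapped query such as
+           { $query : { x : 1 } , $orderby : { y : -1 } , $explain : true , $maxScan : 100 }
+       parses so that getFilter() == { x : 1 }, getOrder() == { y : -1 }, isExplain() is
+       true, and getMaxScan() == 100; a bare { x : 1 } is taken as the filter itself. */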
+
+
+} // namespace mongo
+
+
diff --git a/src/mongo/db/ops/update.cpp b/src/mongo/db/ops/update.cpp
new file mode 100644
index 00000000000..2abc6987218
--- /dev/null
+++ b/src/mongo/db/ops/update.cpp
@@ -0,0 +1,1308 @@
+// update.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "query.h"
+#include "../pdfile.h"
+#include "../jsobjmanipulator.h"
+#include "../queryoptimizer.h"
+#include "../repl.h"
+#include "../btree.h"
+#include "../../util/stringutils.h"
+#include "update.h"
+
+//#define DEBUGUPDATE(x) cout << x << endl;
+#define DEBUGUPDATE(x)
+
+namespace mongo {
+
+ const char* Mod::modNames[] = { "$inc", "$set", "$push", "$pushAll", "$pull", "$pullAll" , "$pop", "$unset" ,
+ "$bitand" , "$bitor" , "$bit" , "$addToSet", "$rename", "$rename"
+ };
+ unsigned Mod::modNamesNum = sizeof(Mod::modNames)/sizeof(char*);
+
+ bool Mod::_pullElementMatch( BSONElement& toMatch ) const {
+
+ if ( elt.type() != Object ) {
+ // if elt isn't an object, then comparison will work
+ return toMatch.woCompare( elt , false ) == 0;
+ }
+
+ if ( matcherOnPrimitive )
+ return matcher->matches( toMatch.wrap( "" ) );
+
+ if ( toMatch.type() != Object ) {
+ // looking for an object, so this can't match
+ return false;
+ }
+
+ // now we have an object on both sides
+ return matcher->matches( toMatch.embeddedObject() );
+ }
+
+ template< class Builder >
+ void Mod::appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const {
+ BSONType a = in.type();
+ BSONType b = elt.type();
+
+ if ( a == NumberDouble || b == NumberDouble ) {
+ ms.incType = NumberDouble;
+ ms.incdouble = elt.numberDouble() + in.numberDouble();
+ }
+ else if ( a == NumberLong || b == NumberLong ) {
+ ms.incType = NumberLong;
+ ms.inclong = elt.numberLong() + in.numberLong();
+ }
+ else {
+ int x = elt.numberInt() + in.numberInt();
+ if ( x < 0 && elt.numberInt() > 0 && in.numberInt() > 0 ) {
+ // overflow
+ ms.incType = NumberLong;
+ ms.inclong = elt.numberLong() + in.numberLong();
+ }
+ else {
+ ms.incType = NumberInt;
+ ms.incint = elt.numberInt() + in.numberInt();
+ }
+ }
+
+ ms.appendIncValue( bb , false );
+ }
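+
+    /* Worked example of the overflow promotion above: applying { $inc : { n : 1 } } to an
+       int field holding 2147483647 (INT_MAX) would wrap to a negative int, so the sum is
+       recomputed in 64 bits and stored as NumberLong( 2147483648 ). */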
+
+ template< class Builder >
+ void appendUnset( Builder &b ) {
+ }
+
+ template<>
+ void appendUnset( BSONArrayBuilder &b ) {
+ b.appendNull();
+ }
+
+ template< class Builder >
+ void Mod::apply( Builder& b , BSONElement in , ModState& ms ) const {
+ if ( ms.dontApply ) {
+ return;
+ }
+
+ switch ( op ) {
+
+ case INC: {
+ appendIncremented( b , in , ms );
+ break;
+ }
+
+ case SET: {
+ _checkForAppending( elt );
+ b.appendAs( elt , shortFieldName );
+ break;
+ }
+
+ case UNSET: {
+ appendUnset( b );
+ break;
+ }
+
+ case PUSH: {
+ uassert( 10131 , "$push can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+ BSONObjIterator i( in.embeddedObject() );
+ int n=0;
+ while ( i.more() ) {
+ bb.append( i.next() );
+ n++;
+ }
+
+ ms.pushStartSize = n;
+
+ bb.appendAs( elt , bb.numStr( n ) );
+ bb.done();
+ break;
+ }
+
+ case ADDTOSET: {
+ uassert( 12592 , "$addToSet can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ BSONObjIterator i( in.embeddedObject() );
+ int n=0;
+
+ if ( isEach() ) {
+
+ BSONElementSet toadd;
+ parseEach( toadd );
+
+ while ( i.more() ) {
+ BSONElement cur = i.next();
+ bb.append( cur );
+ n++;
+ toadd.erase( cur );
+ }
+
+ {
+ BSONObjIterator i( getEach() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( toadd.count(e) ) {
+ bb.appendAs( e , BSONObjBuilder::numStr( n++ ) );
+ toadd.erase( e );
+ }
+ }
+ }
+
+ }
+ else {
+
+ bool found = false;
+
+ while ( i.more() ) {
+ BSONElement cur = i.next();
+ bb.append( cur );
+ n++;
+ if ( elt.woCompare( cur , false ) == 0 )
+ found = true;
+ }
+
+ if ( ! found )
+ bb.appendAs( elt , bb.numStr( n ) );
+
+ }
+
+ bb.done();
+ break;
+ }
+
+
+
+ case PUSH_ALL: {
+ uassert( 10132 , "$pushAll can only be applied to an array" , in.type() == Array );
+            uassert( 10133 , "$pushAll has to be passed an array" , elt.type() == Array );
+
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ BSONObjIterator i( in.embeddedObject() );
+ int n=0;
+ while ( i.more() ) {
+ bb.append( i.next() );
+ n++;
+ }
+
+ ms.pushStartSize = n;
+
+ i = BSONObjIterator( elt.embeddedObject() );
+ while ( i.more() ) {
+ bb.appendAs( i.next() , bb.numStr( n++ ) );
+ }
+
+ bb.done();
+ break;
+ }
+
+ case PULL:
+ case PULL_ALL: {
+ uassert( 10134 , "$pull/$pullAll can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ int n = 0;
+
+ BSONObjIterator i( in.embeddedObject() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ bool allowed = true;
+
+ if ( op == PULL ) {
+ allowed = ! _pullElementMatch( e );
+ }
+ else {
+ BSONObjIterator j( elt.embeddedObject() );
+ while( j.more() ) {
+ BSONElement arrJ = j.next();
+ if ( e.woCompare( arrJ, false ) == 0 ) {
+ allowed = false;
+ break;
+ }
+ }
+ }
+
+ if ( allowed )
+ bb.appendAs( e , bb.numStr( n++ ) );
+ }
+
+ bb.done();
+ break;
+ }
+
+ case POP: {
+ uassert( 10135 , "$pop can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ int n = 0;
+
+ BSONObjIterator i( in.embeddedObject() );
+ if ( elt.isNumber() && elt.number() < 0 ) {
+ // pop from front
+ if ( i.more() ) {
+ i.next();
+ n++;
+ }
+
+ while( i.more() ) {
+ bb.appendAs( i.next() , bb.numStr( n - 1 ) );
+ n++;
+ }
+ }
+ else {
+ // pop from back
+ while( i.more() ) {
+ n++;
+ BSONElement arrI = i.next();
+ if ( i.more() ) {
+ bb.append( arrI );
+ }
+ }
+ }
+
+ ms.pushStartSize = n;
+ assert( ms.pushStartSize == in.embeddedObject().nFields() );
+ bb.done();
+ break;
+ }
+
+ case BIT: {
+            uassert( 10136 , "$bit needs an object" , elt.type() == Object );
+ uassert( 10137 , "$bit can only be applied to numbers" , in.isNumber() );
+ uassert( 10138 , "$bit cannot update a value of type double" , in.type() != NumberDouble );
+
+ int x = in.numberInt();
+ long long y = in.numberLong();
+
+ BSONObjIterator it( elt.embeddedObject() );
+ while ( it.more() ) {
+ BSONElement e = it.next();
+ uassert( 10139 , "$bit field must be number" , e.isNumber() );
+ if ( str::equals(e.fieldName(), "and") ) {
+ switch( in.type() ) {
+ case NumberInt: x = x&e.numberInt(); break;
+ case NumberLong: y = y&e.numberLong(); break;
+ default: assert( 0 );
+ }
+ }
+ else if ( str::equals(e.fieldName(), "or") ) {
+ switch( in.type() ) {
+ case NumberInt: x = x|e.numberInt(); break;
+ case NumberLong: y = y|e.numberLong(); break;
+ default: assert( 0 );
+ }
+ }
+ else {
+ uasserted(9016, str::stream() << "unknown $bit operation: " << e.fieldName());
+ }
+ }
+
+ switch( in.type() ) {
+ case NumberInt: b.append( shortFieldName , x ); break;
+ case NumberLong: b.append( shortFieldName , y ); break;
+ default: assert( 0 );
+ }
+
+ break;
+ }
+
+ case RENAME_FROM: {
+ break;
+ }
+
+ case RENAME_TO: {
+ ms.handleRename( b, shortFieldName );
+ break;
+ }
+
+ default:
+ stringstream ss;
+ ss << "Mod::apply can't handle type: " << op;
+ throw UserException( 9017, ss.str() );
+ }
+ }
+
+    // Returns: -1 if the path descends into a non-object (which could be an array),
+    //           0 if the field is missing,
+    //           1 if the field is found.
+ int validRenamePath( BSONObj obj, const char *path ) {
+ while( const char *p = strchr( path, '.' ) ) {
+ string left( path, p - path );
+ BSONElement e = obj.getField( left );
+ if ( e.eoo() ) {
+ return 0;
+ }
+ if ( e.type() != Object ) {
+ return -1;
+ }
+ obj = e.embeddedObject();
+ path = p + 1;
+ }
+ return !obj.getField( path ).eoo();
+ }
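+
+    /* Worked examples for the return values above: with obj = { a : { b : 1 } },
+       validRenamePath( obj, "a.b" ) == 1 (found), validRenamePath( obj, "a.c" ) == 0
+       (missing), and validRenamePath( obj, "a.b.c" ) == -1 (the path descends into the
+       non-object value at "a.b"). */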
+
+ auto_ptr<ModSetState> ModSet::prepare(const BSONObj &obj) const {
+ DEBUGUPDATE( "\t start prepare" );
+ auto_ptr<ModSetState> mss( new ModSetState( obj ) );
+
+
+ // Perform this check first, so that we don't leave a partially modified object on uassert.
+ for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) {
+ DEBUGUPDATE( "\t\t prepare : " << i->first );
+ ModState& ms = mss->_mods[i->first];
+
+ const Mod& m = i->second;
+ BSONElement e = obj.getFieldDotted(m.fieldName);
+
+ ms.m = &m;
+ ms.old = e;
+
+ if ( m.op == Mod::RENAME_FROM ) {
+ int source = validRenamePath( obj, m.fieldName );
+ uassert( 13489, "$rename source field invalid", source != -1 );
+ if ( source != 1 ) {
+ ms.dontApply = true;
+ }
+ continue;
+ }
+
+ if ( m.op == Mod::RENAME_TO ) {
+ int source = validRenamePath( obj, m.renameFrom() );
+ if ( source == 1 ) {
+ int target = validRenamePath( obj, m.fieldName );
+ uassert( 13490, "$rename target field invalid", target != -1 );
+ ms.newVal = obj.getFieldDotted( m.renameFrom() );
+ mss->amIInPlacePossible( false );
+ }
+ else {
+ ms.dontApply = true;
+ }
+ continue;
+ }
+
+ if ( e.eoo() ) {
+ mss->amIInPlacePossible( m.op == Mod::UNSET );
+ continue;
+ }
+
+ switch( m.op ) {
+ case Mod::INC:
+ uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() );
+ if ( mss->amIInPlacePossible( e.isNumber() ) ) {
+ // check more typing info here
+ if ( m.elt.type() != e.type() ) {
+                    // if I'm incrementing with a double, then the storage has to be a double
+ mss->amIInPlacePossible( m.elt.type() != NumberDouble );
+ }
+
+ // check for overflow
+ if ( e.type() == NumberInt && e.numberLong() + m.elt.numberLong() > numeric_limits<int>::max() ) {
+ mss->amIInPlacePossible( false );
+ }
+ }
+ break;
+
+ case Mod::SET:
+ mss->amIInPlacePossible( m.elt.type() == e.type() &&
+ m.elt.valuesize() == e.valuesize() );
+ break;
+
+ case Mod::PUSH:
+ case Mod::PUSH_ALL:
+ uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() );
+ mss->amIInPlacePossible( false );
+ break;
+
+ case Mod::PULL:
+ case Mod::PULL_ALL: {
+ uassert( 10142 , "Cannot apply $pull/$pullAll modifier to non-array", e.type() == Array || e.eoo() );
+ BSONObjIterator i( e.embeddedObject() );
+ while( mss->_inPlacePossible && i.more() ) {
+ BSONElement arrI = i.next();
+ if ( m.op == Mod::PULL ) {
+ mss->amIInPlacePossible( ! m._pullElementMatch( arrI ) );
+ }
+ else if ( m.op == Mod::PULL_ALL ) {
+ BSONObjIterator j( m.elt.embeddedObject() );
+ while( mss->_inPlacePossible && j.moreWithEOO() ) {
+ BSONElement arrJ = j.next();
+ if ( arrJ.eoo() )
+ break;
+ mss->amIInPlacePossible( arrI.woCompare( arrJ, false ) );
+ }
+ }
+ }
+ break;
+ }
+
+ case Mod::POP: {
+ uassert( 10143 , "Cannot apply $pop modifier to non-array", e.type() == Array || e.eoo() );
+ mss->amIInPlacePossible( e.embeddedObject().isEmpty() );
+ break;
+ }
+
+ case Mod::ADDTOSET: {
+ uassert( 12591 , "Cannot apply $addToSet modifier to non-array", e.type() == Array || e.eoo() );
+
+ BSONObjIterator i( e.embeddedObject() );
+ if ( m.isEach() ) {
+ BSONElementSet toadd;
+ m.parseEach( toadd );
+ while( i.more() ) {
+ BSONElement arrI = i.next();
+ toadd.erase( arrI );
+ }
+ mss->amIInPlacePossible( toadd.size() == 0 );
+ }
+ else {
+ bool found = false;
+ while( i.more() ) {
+ BSONElement arrI = i.next();
+ if ( arrI.woCompare( m.elt , false ) == 0 ) {
+ found = true;
+ break;
+ }
+ }
+ mss->amIInPlacePossible( found );
+ }
+ break;
+ }
+
+ default:
+ // mods we don't know about shouldn't be done in place
+ mss->amIInPlacePossible( false );
+ }
+ }
+
+ DEBUGUPDATE( "\t mss\n" << mss->toString() << "\t--" );
+
+ return mss;
+ }
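+
+    /* For illustration (documents hypothetical): given obj = { x : 5 } with x stored as
+       an int, { $set : { x : 7 } } is in-place eligible (same type, same value size),
+       while { $push : { arr : 1 } } or a $set that changes the value size forces a full
+       rewrite via createNewFromMods(). */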
+
+ void ModState::appendForOpLog( BSONObjBuilder& b ) const {
+ if ( dontApply ) {
+ return;
+ }
+
+ if ( incType ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog inc fieldname: " << m->fieldName << " short:" << m->shortFieldName );
+ BSONObjBuilder bb( b.subobjStart( "$set" ) );
+ appendIncValue( bb , true );
+ bb.done();
+ return;
+ }
+
+ if ( m->op == Mod::RENAME_FROM ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_FROM fieldName:" << m->fieldName );
+ BSONObjBuilder bb( b.subobjStart( "$unset" ) );
+ bb.append( m->fieldName, 1 );
+ bb.done();
+ return;
+ }
+
+ if ( m->op == Mod::RENAME_TO ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_TO fieldName:" << m->fieldName );
+ BSONObjBuilder bb( b.subobjStart( "$set" ) );
+            bb.appendAs( newVal, m->fieldName );
+            bb.done();
+            return;
+ }
+
+ const char * name = fixedOpName ? fixedOpName : Mod::modNames[op()];
+
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog name:" << name << " fixed: " << fixed << " fn: " << m->fieldName );
+
+ BSONObjBuilder bb( b.subobjStart( name ) );
+ if ( fixed ) {
+ bb.appendAs( *fixed , m->fieldName );
+ }
+ else {
+ bb.appendAs( m->elt , m->fieldName );
+ }
+ bb.done();
+ }
+
+ string ModState::toString() const {
+ stringstream ss;
+ if ( fixedOpName )
+ ss << " fixedOpName: " << fixedOpName;
+ if ( fixed )
+ ss << " fixed: " << fixed;
+ return ss.str();
+ }
+
+ template< class Builder >
+ void ModState::handleRename( Builder &newObjBuilder, const char *shortFieldName ) {
+ newObjBuilder.appendAs( newVal , shortFieldName );
+ BSONObjBuilder b;
+ b.appendAs( newVal, shortFieldName );
+ assert( _objData.isEmpty() );
+ _objData = b.obj();
+ newVal = _objData.firstElement();
+ }
+
+ void ModSetState::applyModsInPlace( bool isOnDisk ) {
+ // TODO i think this assert means that we can get rid of the isOnDisk param
+ // and just use isOwned as the determination
+ DEV assert( isOnDisk == ! _obj.isOwned() );
+
+ for ( ModStateHolder::iterator i = _mods.begin(); i != _mods.end(); ++i ) {
+ ModState& m = i->second;
+
+ if ( m.dontApply ) {
+ continue;
+ }
+
+ switch ( m.m->op ) {
+ case Mod::UNSET:
+ case Mod::ADDTOSET:
+ case Mod::RENAME_FROM:
+ case Mod::RENAME_TO:
+ // this should have been handled by prepare
+ break;
+ case Mod::PULL:
+ case Mod::PULL_ALL:
+ // this should have been handled by prepare
+ break;
+ case Mod::POP:
+ assert( m.old.eoo() || ( m.old.isABSONObj() && m.old.Obj().isEmpty() ) );
+ break;
+ // [dm] the BSONElementManipulator statements below are for replication (correct?)
+ case Mod::INC:
+ if ( isOnDisk )
+ m.m->IncrementMe( m.old );
+ else
+ m.m->incrementMe( m.old );
+ m.fixedOpName = "$set";
+ m.fixed = &(m.old);
+ break;
+ case Mod::SET:
+ if ( isOnDisk )
+ BSONElementManipulator( m.old ).ReplaceTypeAndValue( m.m->elt );
+ else
+ BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt );
+ break;
+ default:
+ uassert( 13478 , "can't apply mod in place - shouldn't have gotten here" , 0 );
+ }
+ }
+ }
+
+ void ModSet::extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ) {
+ if ( top.type() != Object ) {
+ fields[ base + top.fieldName() ] = top;
+ return;
+ }
+ BSONObjIterator i( top.embeddedObject() );
+ bool empty = true;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ extractFields( fields, e, base + top.fieldName() + "." );
+ empty = false;
+ }
+ if ( empty )
+ fields[ base + top.fieldName() ] = top;
+ }
+
+ template< class Builder >
+ void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ) {
+ const char * temp = m.fieldName();
+ temp += root.size();
+ const char * dot = strchr( temp , '.' );
+ if ( dot ) {
+ string nr( m.fieldName() , 0 , 1 + ( dot - m.fieldName() ) );
+ string nf( temp , 0 , dot - temp );
+ if ( onedownseen.count( nf ) )
+ return;
+ onedownseen.insert( nf );
+ BSONObjBuilder bb ( b.subobjStart( nf ) );
+ createNewFromMods( nr , bb , BSONObj() ); // don't infer an array from name
+ bb.done();
+ }
+ else {
+ appendNewFromMod( m , b );
+ }
+
+ }
+
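+    // createNewFromMods: merge-walk the document's elements (sorted) and the
+    // pending mods (sorted by field name) in tandem -- copy untouched elements
+    // as-is, apply mods on matching fields, recurse into subobjects when a mod
+    // targets an embedded field, and append brand-new fields at the end.
+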
+ template< class Builder >
+ void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ) {
+ DEBUGUPDATE( "\t\t createNewFromMods root: " << root );
+ BSONObjIteratorSorted es( obj );
+ BSONElement e = es.next();
+
+ ModStateHolder::iterator m = _mods.lower_bound( root );
+ StringBuilder buf(root.size() + 2 );
+ buf << root << (char)255;
+ ModStateHolder::iterator mend = _mods.lower_bound( buf.str() );
+
+ set<string> onedownseen;
+
+ while ( e.type() && m != mend ) {
+ string field = root + e.fieldName();
+ FieldCompareResult cmp = compareDottedFieldNames( m->second.m->fieldName , field );
+
+ DEBUGUPDATE( "\t\t\t field:" << field << "\t mod:" << m->second.m->fieldName << "\t cmp:" << cmp << "\t short: " << e.fieldName() );
+
+ switch ( cmp ) {
+
+ case LEFT_SUBFIELD: { // Mod is embedded under this element
+ uassert( 10145 , str::stream() << "LEFT_SUBFIELD only supports Object: " << field << " not: " << e.type() , e.type() == Object || e.type() == Array );
+ if ( onedownseen.count( e.fieldName() ) == 0 ) {
+ onedownseen.insert( e.fieldName() );
+ if ( e.type() == Object ) {
+ BSONObjBuilder bb( b.subobjStart( e.fieldName() ) );
+ stringstream nr; nr << root << e.fieldName() << ".";
+ createNewFromMods( nr.str() , bb , e.embeddedObject() );
+ bb.done();
+ }
+ else {
+ BSONArrayBuilder ba( b.subarrayStart( e.fieldName() ) );
+ stringstream nr; nr << root << e.fieldName() << ".";
+ createNewFromMods( nr.str() , ba , e.embeddedObject() );
+ ba.done();
+ }
+ // inc both as we handled both
+ e = es.next();
+ m++;
+ }
+ else {
+                    // this is a very weird case; we have seen it in production
+                    // but can't reproduce it. the assert prevents an infinite
+                    // loop but likely isn't the correct solution.
+ assert(0);
+ }
+ continue;
+ }
+ case LEFT_BEFORE: // Mod on a field that doesn't exist
+ DEBUGUPDATE( "\t\t\t\t creating new field for: " << m->second.m->fieldName );
+ _appendNewFromMods( root , m->second , b , onedownseen );
+ m++;
+ continue;
+ case SAME:
+ DEBUGUPDATE( "\t\t\t\t applying mod on: " << m->second.m->fieldName );
+ m->second.apply( b , e );
+ e = es.next();
+ m++;
+ continue;
+ case RIGHT_BEFORE: // field that doesn't have a MOD
+ DEBUGUPDATE( "\t\t\t\t just copying" );
+ b.append( e ); // if array, ignore field name
+ e = es.next();
+ continue;
+ case RIGHT_SUBFIELD:
+ massert( 10399 , "ModSet::createNewFromMods - RIGHT_SUBFIELD should be impossible" , 0 );
+ break;
+ default:
+ massert( 10400 , "unhandled case" , 0 );
+ }
+ }
+
+ // finished looping the mods, just adding the rest of the elements
+ while ( e.type() ) {
+ DEBUGUPDATE( "\t\t\t copying: " << e.fieldName() );
+ b.append( e ); // if array, ignore field name
+ e = es.next();
+ }
+
+ // do mods that don't have fields already
+ for ( ; m != mend; m++ ) {
+ DEBUGUPDATE( "\t\t\t\t appending from mod at end: " << m->second.m->fieldName );
+ _appendNewFromMods( root , m->second , b , onedownseen );
+ }
+ }
+
+ BSONObj ModSetState::createNewFromMods() {
+ BSONObjBuilder b( (int)(_obj.objsize() * 1.1) );
+ createNewFromMods( "" , b , _obj );
+ return _newFromMods = b.obj();
+ }
+
+ string ModSetState::toString() const {
+ stringstream ss;
+ for ( ModStateHolder::const_iterator i=_mods.begin(); i!=_mods.end(); ++i ) {
+ ss << "\t\t" << i->first << "\t" << i->second.toString() << "\n";
+ }
+ return ss.str();
+ }
+
+ bool ModSetState::FieldCmp::operator()( const string &l, const string &r ) const {
+ return lexNumCmp( l.c_str(), r.c_str() ) < 0;
+ }
+
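+    // Illustrative example: for query { a: 1, b: { $gt: 5 } } the seed object
+    // is { a: 1 } -- pure $-operator clauses (and $atomic) are skipped -- and
+    // the mods are then applied to it to form the upsert document.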
+ BSONObj ModSet::createNewFromQuery( const BSONObj& query ) {
+ BSONObj newObj;
+
+ {
+ BSONObjBuilder bb;
+ EmbeddedBuilder eb( &bb );
+ BSONObjIteratorSorted i( query );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.fieldName()[0] == '$' ) // for $atomic and anything else we add
+ continue;
+
+ if ( e.type() == Object && e.embeddedObject().firstElementFieldName()[0] == '$' ) {
+ // this means this is a $gt type filter, so don't make part of the new object
+ continue;
+ }
+
+ eb.appendAs( e , e.fieldName() );
+ }
+ eb.done();
+ newObj = bb.obj();
+ }
+
+ auto_ptr<ModSetState> mss = prepare( newObj );
+
+ if ( mss->canApplyInPlace() )
+ mss->applyModsInPlace( false );
+ else
+ newObj = mss->createNewFromMods();
+
+ return newObj;
+ }
+
+ /* get special operations like $inc
+ { $inc: { a:1, b:1 } }
+ { $set: { a:77 } }
+ { $push: { a:55 } }
+ { $pushAll: { a:[77,88] } }
+ { $pull: { a:66 } }
+ { $pullAll : { a:[99,1010] } }
+       NOTE: MODIFIES the source 'from' object!
+ */
+ ModSet::ModSet(
+ const BSONObj &from ,
+ const set<string>& idxKeys,
+ const set<string> *backgroundKeys)
+ : _isIndexed(0) , _hasDynamicArray( false ) {
+
+ BSONObjIterator it(from);
+
+ while ( it.more() ) {
+ BSONElement e = it.next();
+ const char *fn = e.fieldName();
+
+ uassert( 10147 , "Invalid modifier specified: " + string( fn ), e.type() == Object );
+ BSONObj j = e.embeddedObject();
+ DEBUGUPDATE( "\t" << j );
+
+ BSONObjIterator jt(j);
+ Mod::Op op = opFromStr( fn );
+
+ while ( jt.more() ) {
+ BSONElement f = jt.next(); // x:44
+
+ const char * fieldName = f.fieldName();
+
+ uassert( 15896 , "Modified field name may not start with $", fieldName[0] != '$' || op == Mod::UNSET ); // allow remove of invalid field name in case it was inserted before this check was added (~ version 2.1)
+ uassert( 10148 , "Mod on _id not allowed", strcmp( fieldName, "_id" ) != 0 );
+ uassert( 10149 , "Invalid mod field name, may not end in a period", fieldName[ strlen( fieldName ) - 1 ] != '.' );
+ uassert( 10150 , "Field name duplication not allowed with modifiers", ! haveModForField( fieldName ) );
+ uassert( 10151 , "have conflicting mods in update" , ! haveConflictingMod( fieldName ) );
+ uassert( 10152 , "Modifier $inc allowed for numbers only", f.isNumber() || op != Mod::INC );
+ uassert( 10153 , "Modifier $pushAll/pullAll allowed for arrays only", f.type() == Array || ( op != Mod::PUSH_ALL && op != Mod::PULL_ALL ) );
+
+ if ( op == Mod::RENAME_TO ) {
+ uassert( 13494, "$rename target must be a string", f.type() == String );
+ const char *target = f.valuestr();
+ uassert( 13495, "$rename source must differ from target", strcmp( fieldName, target ) != 0 );
+ uassert( 13496, "invalid mod field name, source may not be empty", fieldName[0] );
+ uassert( 13479, "invalid mod field name, target may not be empty", target[0] );
+ uassert( 13480, "invalid mod field name, source may not begin or end in period", fieldName[0] != '.' && fieldName[ strlen( fieldName ) - 1 ] != '.' );
+ uassert( 13481, "invalid mod field name, target may not begin or end in period", target[0] != '.' && target[ strlen( target ) - 1 ] != '.' );
+ uassert( 13482, "$rename affecting _id not allowed", !( fieldName[0] == '_' && fieldName[1] == 'i' && fieldName[2] == 'd' && ( !fieldName[3] || fieldName[3] == '.' ) ) );
+ uassert( 13483, "$rename affecting _id not allowed", !( target[0] == '_' && target[1] == 'i' && target[2] == 'd' && ( !target[3] || target[3] == '.' ) ) );
+ uassert( 13484, "field name duplication not allowed with $rename target", !haveModForField( target ) );
+ uassert( 13485, "conflicting mods not allowed with $rename target", !haveConflictingMod( target ) );
+ uassert( 13486, "$rename target may not be a parent of source", !( strncmp( fieldName, target, strlen( target ) ) == 0 && fieldName[ strlen( target ) ] == '.' ) );
+ uassert( 13487, "$rename source may not be dynamic array", strstr( fieldName , ".$" ) == 0 );
+ uassert( 13488, "$rename target may not be dynamic array", strstr( target , ".$" ) == 0 );
+
+ Mod from;
+ from.init( Mod::RENAME_FROM, f );
+ from.setFieldName( fieldName );
+ updateIsIndexed( from, idxKeys, backgroundKeys );
+ _mods[ from.fieldName ] = from;
+
+ Mod to;
+ to.init( Mod::RENAME_TO, f );
+ to.setFieldName( target );
+ updateIsIndexed( to, idxKeys, backgroundKeys );
+ _mods[ to.fieldName ] = to;
+
+ DEBUGUPDATE( "\t\t " << fieldName << "\t" << from.fieldName << "\t" << to.fieldName );
+ continue;
+ }
+
+                _hasDynamicArray = _hasDynamicArray || strstr( fieldName , ".$" ) != 0;
+
+ Mod m;
+ m.init( op , f );
+ m.setFieldName( f.fieldName() );
+ updateIsIndexed( m, idxKeys, backgroundKeys );
+ _mods[m.fieldName] = m;
+
+ DEBUGUPDATE( "\t\t " << fieldName << "\t" << m.fieldName << "\t" << _hasDynamicArray );
+ }
+ }
+
+ }
+
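+#if 0
+    // Illustrative sketch (not compiled): the typical flow through ModSet,
+    // using only names from the surrounding code. idxKeys would normally come
+    // from the collection's indexed-field set.
+    inline void modSetFlowExample() {
+        BSONObj updateobj = BSON( "$inc" << BSON( "a" << 1 ) );
+        set<string> idxKeys;
+        ModSet mods( updateobj, idxKeys );
+
+        BSONObj onDisk = BSON( "_id" << 1 << "a" << 41 );
+        auto_ptr<ModSetState> mss = mods.prepare( onDisk );
+        if ( mss->canApplyInPlace() ) {
+            mss->applyModsInPlace( false ); // mutate onDisk's buffer directly
+        }
+        else {
+            BSONObj newObj = mss->createNewFromMods(); // { _id: 1, a: 42 }
+        }
+    }
+#endif
+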
+ ModSet * ModSet::fixDynamicArray( const char * elemMatchKey ) const {
+ ModSet * n = new ModSet();
+ n->_isIndexed = _isIndexed;
+ n->_hasDynamicArray = _hasDynamicArray;
+ for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ) {
+ string s = i->first;
+ size_t idx = s.find( ".$" );
+ if ( idx == string::npos ) {
+ n->_mods[s] = i->second;
+ continue;
+ }
+ StringBuilder buf(s.size()+strlen(elemMatchKey));
+ buf << s.substr(0,idx+1) << elemMatchKey << s.substr(idx+2);
+ string fixed = buf.str();
+ DEBUGUPDATE( "fixed dynamic: " << s << " -->> " << fixed );
+ n->_mods[fixed] = i->second;
+ ModHolder::iterator temp = n->_mods.find( fixed );
+ temp->second.setFieldName( temp->first.c_str() );
+ }
+ return n;
+ }
+
+ void checkNoMods( BSONObj o ) {
+ BSONObjIterator i( o );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ uassert( 10154 , "Modifiers and non-modifiers cannot be mixed", e.fieldName()[ 0 ] != '$' );
+ }
+ }
+
+ static void checkTooLarge(const BSONObj& newObj) {
+ uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= BSONObjMaxUserSize );
+ }
+
+    /* note: as-is this is only called when:
+
+       - not multi
+       - mods are not indexed
+       - not upsert
+    */
+ static UpdateResult _updateById(bool isOperatorUpdate, int idIdxNo, ModSet *mods, int profile, NamespaceDetails *d,
+ NamespaceDetailsTransient *nsdt,
+ bool god, const char *ns,
+ const BSONObj& updateobj, BSONObj patternOrig, bool logop, OpDebug& debug) {
+
+ DiskLoc loc;
+ {
+ IndexDetails& i = d->idx(idIdxNo);
+ BSONObj key = i.getKeyFromQuery( patternOrig );
+ loc = i.idxInterface().findSingle(i, i.head, key);
+ if( loc.isNull() ) {
+ // no upsert support in _updateById yet, so we are done.
+ return UpdateResult(0, 0, 0);
+ }
+ }
+ Record *r = loc.rec();
+
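+        // If the record isn't resident, page it in while temporarily yielding
+        // the write lock, then re-find everything afterwards since the
+        // collection may have changed (or been dropped) during the yield.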
+ if ( ! r->likelyInPhysicalMemory() ) {
+ {
+ scoped_ptr<LockMongoFilesShared> lk( new LockMongoFilesShared() );
+ dbtempreleasewritelock t;
+ r->touch();
+ lk.reset(0); // we have to release mmmutex before we can re-acquire dbmutex
+ }
+
+ {
+ // we need to re-find in case something changed
+ d = nsdetails( ns );
+ if ( ! d ) {
+ // dropped
+ return UpdateResult(0, 0, 0);
+ }
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ IndexDetails& i = d->idx(idIdxNo);
+ BSONObj key = i.getKeyFromQuery( patternOrig );
+ loc = i.idxInterface().findSingle(i, i.head, key);
+ if( loc.isNull() ) {
+ // no upsert support in _updateById yet, so we are done.
+ return UpdateResult(0, 0, 0);
+ }
+
+ r = loc.rec();
+ }
+ }
+
+        /* look for $inc etc. note: an update is either all modifier ops or a plain
+           replacement; you can't mix modifier and regular fields at the moment. */
+ if ( isOperatorUpdate ) {
+ const BSONObj& onDisk = loc.obj();
+ auto_ptr<ModSetState> mss = mods->prepare( onDisk );
+
+ if( mss->canApplyInPlace() ) {
+ mss->applyModsInPlace(true);
+ DEBUGUPDATE( "\t\t\t updateById doing in place update" );
+ }
+ else {
+ BSONObj newObj = mss->createNewFromMods();
+ checkTooLarge(newObj);
+ assert(nsdt);
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
+ }
+
+ if ( logop ) {
+ DEV assert( mods->size() );
+
+ BSONObj pattern = patternOrig;
+ if ( mss->haveArrayDepMod() ) {
+ BSONObjBuilder patternBuilder;
+ patternBuilder.appendElements( pattern );
+ mss->appendSizeSpecForArrayDepMods( patternBuilder );
+ pattern = patternBuilder.obj();
+ }
+
+ if( mss->needOpLogRewrite() ) {
+ DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
+ logOp("u", ns, mss->getOpLogRewrite() , &pattern );
+ }
+ else {
+ logOp("u", ns, updateobj, &pattern );
+ }
+ }
+ return UpdateResult( 1 , 1 , 1);
+ } // end $operator update
+
+ // regular update
+ BSONElementManipulator::lookForTimestamps( updateobj );
+ checkNoMods( updateobj );
+ assert(nsdt);
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug );
+ if ( logop ) {
+ logOp("u", ns, updateobj, &patternOrig );
+ }
+ return UpdateResult( 1 , 0 , 1 );
+ }
+
+ UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug, RemoveSaver* rs ) {
+ DEBUGUPDATE( "update: " << ns << " update: " << updateobj << " query: " << patternOrig << " upsert: " << upsert << " multi: " << multi );
+ Client& client = cc();
+ int profile = client.database()->profile;
+
+ debug.updateobj = updateobj;
+
+        // idea with these here is to make them loop invariant for multi updates, and thus be a bit faster for that case
+ // The pointers may be left invalid on a failed or terminal yield recovery.
+ NamespaceDetails *d = nsdetails(ns); // can be null if an upsert...
+ NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get(ns);
+
+ auto_ptr<ModSet> mods;
+ bool isOperatorUpdate = updateobj.firstElementFieldName()[0] == '$';
+        int modsIsIndexed = 0; // really the # of indexes
+ if ( isOperatorUpdate ) {
+ if( d && d->indexBuildInProgress ) {
+ set<string> bgKeys;
+ d->inProgIdx().keyPattern().getFieldNames(bgKeys);
+ mods.reset( new ModSet(updateobj, nsdt->indexKeys(), &bgKeys) );
+ }
+ else {
+ mods.reset( new ModSet(updateobj, nsdt->indexKeys()) );
+ }
+ modsIsIndexed = mods->isIndexed();
+ }
+
+ if( !multi && isSimpleIdQuery(patternOrig) && d && !modsIsIndexed ) {
+ int idxNo = d->findIdIndex();
+ if( idxNo >= 0 ) {
+ debug.idhack = true;
+ UpdateResult result = _updateById(isOperatorUpdate, idxNo, mods.get(), profile, d, nsdt, god, ns, updateobj, patternOrig, logop, debug);
+ if ( result.existing || ! upsert ) {
+ return result;
+ }
+ else if ( upsert && ! isOperatorUpdate && ! logop) {
+ // this handles repl inserts
+ checkNoMods( updateobj );
+ debug.upsert = true;
+ BSONObj no = updateobj;
+ theDataFileMgr.insertWithObjMod(ns, no, god);
+ return UpdateResult( 0 , 0 , 1 , no );
+ }
+ }
+ }
+
+ int numModded = 0;
+ long long nscanned = 0;
+ shared_ptr< Cursor > c = NamespaceDetailsTransient::getCursor( ns, patternOrig );
+
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ bool autoDedup = c->autoDedup();
+
+ if( c->ok() ) {
+ set<DiskLoc> seenObjects;
+ MatchDetails details;
+ auto_ptr<ClientCursor> cc;
+ do {
+ nscanned++;
+
+ bool atomic = c->matcher() && c->matcher()->docMatcher().atomic();
+
+ if ( !atomic ) {
+ // *****************
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+
+ bool didYield;
+ if ( ! cc->yieldSometimes( ClientCursor::WillNeed, &didYield ) ) {
+ cc.release();
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+
+ if ( didYield ) {
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ }
+ // *****************
+ }
+
+ if ( !c->currentMatches( &details ) ) {
+ c->advance();
+
+ if ( nscanned % 256 == 0 && ! atomic ) {
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+ if ( ! cc->yield() ) {
+ cc.release();
+ // TODO should we assert or something?
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ }
+ continue;
+ }
+
+ Record *r = c->_current();
+ DiskLoc loc = c->currLoc();
+
+ // TODO Maybe this is unnecessary since we have seenObjects
+ if ( c->getsetdup( loc ) && autoDedup ) {
+ c->advance();
+ continue;
+ }
+
+ BSONObj js(r);
+
+ BSONObj pattern = patternOrig;
+
+ if ( logop ) {
+ BSONObjBuilder idPattern;
+ BSONElement id;
+ // NOTE: If the matching object lacks an id, we'll log
+ // with the original pattern. This isn't replay-safe.
+ // It might make sense to suppress the log instead
+ // if there's no id.
+ if ( js.getObjectID( id ) ) {
+ idPattern.append( id );
+ pattern = idPattern.obj();
+ }
+ else {
+ uassert( 10157 , "multi-update requires all modified objects to have an _id" , ! multi );
+ }
+ }
+
+ if ( profile && !multi )
+ debug.nscanned = (int) nscanned;
+
+            /* look for $inc etc. note: an update is either all modifier ops or a
+               plain replacement; you can't mix modifier and regular fields at the moment. */
+ if ( isOperatorUpdate ) {
+
+ if ( multi ) {
+ c->advance(); // go to next record in case this one moves
+ if ( autoDedup && seenObjects.count( loc ) )
+ continue;
+ }
+
+ const BSONObj& onDisk = loc.obj();
+
+ ModSet * useMods = mods.get();
+ bool forceRewrite = false;
+
+ auto_ptr<ModSet> mymodset;
+ if ( details._elemMatchKey && mods->hasDynamicArray() ) {
+ useMods = mods->fixDynamicArray( details._elemMatchKey );
+ mymodset.reset( useMods );
+ forceRewrite = true;
+ }
+
+ auto_ptr<ModSetState> mss = useMods->prepare( onDisk );
+
+ bool willAdvanceCursor = multi && c->ok() && ( modsIsIndexed || ! mss->canApplyInPlace() );
+
+ if ( willAdvanceCursor ) {
+ if ( cc.get() ) {
+ cc->setDoingDeletes( true );
+ }
+ c->prepareToTouchEarlierIterate();
+ }
+
+ if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ) {
+ mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) );
+
+ DEBUGUPDATE( "\t\t\t doing in place update" );
+ if ( profile && !multi )
+ debug.fastmod = true;
+
+ if ( modsIsIndexed ) {
+ seenObjects.insert( loc );
+ }
+
+ d->paddingFits();
+ }
+ else {
+ if ( rs )
+ rs->goingToDelete( onDisk );
+
+ BSONObj newObj = mss->createNewFromMods();
+ checkTooLarge(newObj);
+ DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
+ if ( newLoc != loc || modsIsIndexed ){
+ // log() << "Moved obj " << newLoc.obj()["_id"] << " from " << loc << " to " << newLoc << endl;
+                        // object moved, need to make sure we don't process it again
+ seenObjects.insert( newLoc );
+ }
+
+ }
+
+ if ( logop ) {
+ DEV assert( mods->size() );
+
+ if ( mss->haveArrayDepMod() ) {
+ BSONObjBuilder patternBuilder;
+ patternBuilder.appendElements( pattern );
+ mss->appendSizeSpecForArrayDepMods( patternBuilder );
+ pattern = patternBuilder.obj();
+ }
+
+ if ( forceRewrite || mss->needOpLogRewrite() ) {
+ DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
+ logOp("u", ns, mss->getOpLogRewrite() , &pattern );
+ }
+ else {
+ logOp("u", ns, updateobj, &pattern );
+ }
+ }
+ numModded++;
+ if ( ! multi )
+ return UpdateResult( 1 , 1 , numModded );
+ if ( willAdvanceCursor )
+ c->recoverFromTouchingEarlierIterate();
+
+ if ( nscanned % 64 == 0 && ! atomic ) {
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+ if ( ! cc->yield() ) {
+ cc.release();
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ }
+
+ getDur().commitIfNeeded();
+
+ continue;
+ }
+
+ uassert( 10158 , "multi update only works with $ operators" , ! multi );
+
+ BSONElementManipulator::lookForTimestamps( updateobj );
+ checkNoMods( updateobj );
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, god);
+ if ( logop ) {
+ DEV wassert( !god ); // god doesn't get logged, this would be bad.
+ logOp("u", ns, updateobj, &pattern );
+ }
+ return UpdateResult( 1 , 0 , 1 );
+ } while ( c->ok() );
+ } // endif
+
+ if ( numModded )
+ return UpdateResult( 1 , 1 , numModded );
+
+ // todo: no need for "if( profile )" here as that probably just makes things slower?
+ if ( profile )
+ debug.nscanned = (int) nscanned;
+
+ if ( upsert ) {
+ if ( updateobj.firstElementFieldName()[0] == '$' ) {
+ // upsert of an $operation. build a default object
+ BSONObj newObj = mods->createNewFromQuery( patternOrig );
+ checkNoMods( newObj );
+ debug.fastmodinsert = true;
+ theDataFileMgr.insertWithObjMod(ns, newObj, god);
+ if ( logop )
+ logOp( "i", ns, newObj );
+
+ return UpdateResult( 0 , 1 , 1 , newObj );
+ }
+ uassert( 10159 , "multi update only works with $ operators" , ! multi );
+ checkNoMods( updateobj );
+ debug.upsert = true;
+ BSONObj no = updateobj;
+ theDataFileMgr.insertWithObjMod(ns, no, god);
+ if ( logop )
+ logOp( "i", ns, no );
+ return UpdateResult( 0 , 0 , 1 , no );
+ }
+
+ return UpdateResult( 0 , isOperatorUpdate , 0 );
+ }
+
+ UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) {
+ uassert( 10155 , "cannot update reserved $ collection", strchr(ns, '$') == 0 );
+ if ( strstr(ns, ".system.") ) {
+ /* dm: it's very important that system.indexes is never updated as IndexDetails has pointers into it */
+ uassert( 10156 , str::stream() << "cannot update system collection: " << ns << " q: " << patternOrig << " u: " << updateobj , legalClientSystemNS( ns , true ) );
+ }
+ return _updateObjects(false, ns, updateobj, patternOrig, upsert, multi, logop, debug);
+ }
+
+}
diff --git a/src/mongo/db/ops/update.h b/src/mongo/db/ops/update.h
new file mode 100644
index 00000000000..9446db06d36
--- /dev/null
+++ b/src/mongo/db/ops/update.h
@@ -0,0 +1,700 @@
+// update.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../../util/embedded_builder.h"
+#include "../matcher.h"
+
+namespace mongo {
+
+ // ---------- public -------------
+
+ struct UpdateResult {
+ const bool existing; // if existing objects were modified
+ const bool mod; // was this a $ mod
+ const long long num; // how many objects touched
+ OID upserted; // if something was upserted, the new _id of the object
+
+ UpdateResult( bool e, bool m, unsigned long long n , const BSONObj& upsertedObject = BSONObj() )
+ : existing(e) , mod(m), num(n) {
+ upserted.clear();
+ BSONElement id = upsertedObject["_id"];
+ if ( ! e && n == 1 && id.type() == jstOID ) {
+ upserted = id.OID();
+ }
+ }
+ };
+
+ class RemoveSaver;
+
+ /* returns true if an existing object was updated, false if no existing object was found.
+ multi - update multiple objects - mostly useful with things like $set
+ god - allow access to system namespaces
+ */
+ UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug );
+ UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern,
+ bool upsert, bool multi , bool logop , OpDebug& debug , RemoveSaver * rs = 0 );
+
+
+
+ // ---------- private -------------
+
+ class ModState;
+ class ModSetState;
+
+ /* Used for modifiers such as $inc, $set, $push, ...
+ * stores the info about a single operation
+ * once created should never be modified
+ */
+ struct Mod {
+ // See opFromStr below
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13
+ enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET, RENAME_FROM, RENAME_TO } op;
+
+ static const char* modNames[];
+ static unsigned modNamesNum;
+
+ const char *fieldName;
+ const char *shortFieldName;
+
+ BSONElement elt; // x:5 note: this is the actual element from the updateobj
+ boost::shared_ptr<Matcher> matcher;
+ bool matcherOnPrimitive;
+
+ void init( Op o , BSONElement& e ) {
+ op = o;
+ elt = e;
+ if ( op == PULL && e.type() == Object ) {
+ BSONObj t = e.embeddedObject();
+ if ( t.firstElement().getGtLtOp() == 0 ) {
+ matcher.reset( new Matcher( t ) );
+ matcherOnPrimitive = false;
+ }
+ else {
+ matcher.reset( new Matcher( BSON( "" << t ) ) );
+ matcherOnPrimitive = true;
+ }
+ }
+ }
+
+ void setFieldName( const char * s ) {
+ fieldName = s;
+ shortFieldName = strrchr( fieldName , '.' );
+ if ( shortFieldName )
+ shortFieldName++;
+ else
+ shortFieldName = fieldName;
+ }
+
+ /**
+         * @param in increments the actual value inside 'in'
+ */
+ void incrementMe( BSONElement& in ) const {
+ BSONElementManipulator manip( in );
+ switch ( in.type() ) {
+ case NumberDouble:
+ manip.setNumber( elt.numberDouble() + in.numberDouble() );
+ break;
+ case NumberLong:
+ manip.setLong( elt.numberLong() + in.numberLong() );
+ break;
+ case NumberInt:
+ manip.setInt( elt.numberInt() + in.numberInt() );
+ break;
+ default:
+ assert(0);
+ }
+ }
+ void IncrementMe( BSONElement& in ) const {
+ BSONElementManipulator manip( in );
+ switch ( in.type() ) {
+ case NumberDouble:
+ manip.SetNumber( elt.numberDouble() + in.numberDouble() );
+ break;
+ case NumberLong:
+ manip.SetLong( elt.numberLong() + in.numberLong() );
+ break;
+ case NumberInt:
+ manip.SetInt( elt.numberInt() + in.numberInt() );
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ template< class Builder >
+ void appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const;
+
+ bool operator<( const Mod &other ) const {
+ return strcmp( fieldName, other.fieldName ) < 0;
+ }
+
+ bool arrayDep() const {
+ switch (op) {
+ case PUSH:
+ case PUSH_ALL:
+ case POP:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ static bool isIndexed( const string& fullName , const set<string>& idxKeys ) {
+ const char * fieldName = fullName.c_str();
+ // check if there is an index key that is a parent of mod
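+            // e.g. a mod on "a.b.c" is indexed if there is an index on "a" or "a.b"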
+ for( const char *dot = strchr( fieldName, '.' ); dot; dot = strchr( dot + 1, '.' ) )
+ if ( idxKeys.count( string( fieldName, dot - fieldName ) ) )
+ return true;
+
+ // check if there is an index key equal to mod
+ if ( idxKeys.count(fullName) )
+ return true;
+ // check if there is an index key that is a child of mod
+ set< string >::const_iterator j = idxKeys.upper_bound( fullName );
+ if ( j != idxKeys.end() && j->find( fullName ) == 0 && (*j)[fullName.size()] == '.' )
+ return true;
+
+ return false;
+ }
+
+ bool isIndexed( const set<string>& idxKeys ) const {
+ string fullName = fieldName;
+
+ if ( isIndexed( fullName , idxKeys ) )
+ return true;
+
+ if ( strstr( fieldName , "." ) ) {
+ // check for a.0.1
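+                // e.g. "a.0.b" (and "a.$.b") must also be checked as "a.b",
+                // since numeric/positional path components address array
+                // elements that an index on "a.b" would cover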
+ StringBuilder buf( fullName.size() + 1 );
+ for ( size_t i=0; i<fullName.size(); i++ ) {
+ char c = fullName[i];
+
+ if ( c == '$' &&
+ i > 0 && fullName[i-1] == '.' &&
+ i+1<fullName.size() &&
+ fullName[i+1] == '.' ) {
+ i++;
+ continue;
+ }
+
+ buf << c;
+
+ if ( c != '.' )
+ continue;
+
+ if ( ! isdigit( fullName[i+1] ) )
+ continue;
+
+ bool possible = true;
+ size_t j=i+2;
+ for ( ; j<fullName.size(); j++ ) {
+ char d = fullName[j];
+ if ( d == '.' )
+ break;
+ if ( isdigit( d ) )
+ continue;
+ possible = false;
+ break;
+ }
+
+ if ( possible )
+ i = j;
+ }
+ string x = buf.str();
+ if ( isIndexed( x , idxKeys ) )
+ return true;
+ }
+
+ return false;
+ }
+
+ template< class Builder >
+ void apply( Builder& b , BSONElement in , ModState& ms ) const;
+
+ /**
+ * @return true iff toMatch should be removed from the array
+ */
+ bool _pullElementMatch( BSONElement& toMatch ) const;
+
+ void _checkForAppending( const BSONElement& e ) const {
+ if ( e.type() == Object ) {
+ // this is a tiny bit slow, but rare and important
+ // only when setting something TO an object, not setting something in an object
+ // and it checks for { $set : { x : { 'a.b' : 1 } } }
+            // which we feel has been common
+ uassert( 12527 , "not okForStorage" , e.embeddedObject().okForStorage() );
+ }
+ }
+
+ bool isEach() const {
+ if ( elt.type() != Object )
+ return false;
+ BSONElement e = elt.embeddedObject().firstElement();
+ if ( e.type() != Array )
+ return false;
+ return strcmp( e.fieldName() , "$each" ) == 0;
+ }
+
+ BSONObj getEach() const {
+ return elt.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck();
+ }
+
+ void parseEach( BSONElementSet& s ) const {
+ BSONObjIterator i(getEach());
+ while ( i.more() ) {
+ s.insert( i.next() );
+ }
+ }
+
+ const char *renameFrom() const {
+ massert( 13492, "mod must be RENAME_TO type", op == Mod::RENAME_TO );
+ return elt.fieldName();
+ }
+ };
+
+ /**
+ * stores a set of Mods
+ * once created, should never be changed
+ */
+ class ModSet : boost::noncopyable {
+ typedef map<string,Mod> ModHolder;
+ ModHolder _mods;
+ int _isIndexed;
+ bool _hasDynamicArray;
+
+ static void extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base );
+
+ FieldCompareResult compare( const ModHolder::iterator &m, map< string, BSONElement >::iterator &p, const map< string, BSONElement >::iterator &pEnd ) const {
+ bool mDone = ( m == _mods.end() );
+ bool pDone = ( p == pEnd );
+ assert( ! mDone );
+ assert( ! pDone );
+ if ( mDone && pDone )
+ return SAME;
+ // If one iterator is done we want to read from the other one, so say the other one is lower.
+ if ( mDone )
+ return RIGHT_BEFORE;
+ if ( pDone )
+ return LEFT_BEFORE;
+
+ return compareDottedFieldNames( m->first, p->first.c_str() );
+ }
+
+ bool mayAddEmbedded( map< string, BSONElement > &existing, string right ) {
+ for( string left = EmbeddedBuilder::splitDot( right );
+ left.length() > 0 && left[ left.length() - 1 ] != '.';
+ left += "." + EmbeddedBuilder::splitDot( right ) ) {
+ if ( existing.count( left ) > 0 && existing[ left ].type() != Object )
+ return false;
+ if ( haveModForField( left.c_str() ) )
+ return false;
+ }
+ return true;
+ }
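+        // Decode a modifier name into its Op by hand-rolled prefix matching,
+        // e.g. "$inc" -> INC, "$pushAll" -> PUSH_ALL. "$rename" deliberately
+        // returns RENAME_TO; the ModSet constructor splits it into a
+        // RENAME_FROM / RENAME_TO pair.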
+ static Mod::Op opFromStr( const char *fn ) {
+ assert( fn[0] == '$' );
+ switch( fn[1] ) {
+ case 'i': {
+ if ( fn[2] == 'n' && fn[3] == 'c' && fn[4] == 0 )
+ return Mod::INC;
+ break;
+ }
+ case 's': {
+ if ( fn[2] == 'e' && fn[3] == 't' && fn[4] == 0 )
+ return Mod::SET;
+ break;
+ }
+ case 'p': {
+ if ( fn[2] == 'u' ) {
+ if ( fn[3] == 's' && fn[4] == 'h' ) {
+ if ( fn[5] == 0 )
+ return Mod::PUSH;
+ if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 )
+ return Mod::PUSH_ALL;
+ }
+ else if ( fn[3] == 'l' && fn[4] == 'l' ) {
+ if ( fn[5] == 0 )
+ return Mod::PULL;
+ if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 )
+ return Mod::PULL_ALL;
+ }
+ }
+ else if ( fn[2] == 'o' && fn[3] == 'p' && fn[4] == 0 )
+ return Mod::POP;
+ break;
+ }
+ case 'u': {
+ if ( fn[2] == 'n' && fn[3] == 's' && fn[4] == 'e' && fn[5] == 't' && fn[6] == 0 )
+ return Mod::UNSET;
+ break;
+ }
+ case 'b': {
+ if ( fn[2] == 'i' && fn[3] == 't' ) {
+ if ( fn[4] == 0 )
+ return Mod::BIT;
+ if ( fn[4] == 'a' && fn[5] == 'n' && fn[6] == 'd' && fn[7] == 0 )
+ return Mod::BITAND;
+ if ( fn[4] == 'o' && fn[5] == 'r' && fn[6] == 0 )
+ return Mod::BITOR;
+ }
+ break;
+ }
+ case 'a': {
+ if ( fn[2] == 'd' && fn[3] == 'd' ) {
+ // add
+ if ( fn[4] == 'T' && fn[5] == 'o' && fn[6] == 'S' && fn[7] == 'e' && fn[8] == 't' && fn[9] == 0 )
+ return Mod::ADDTOSET;
+
+ }
+ break;
+ }
+ case 'r': {
+ if ( fn[2] == 'e' && fn[3] == 'n' && fn[4] == 'a' && fn[5] == 'm' && fn[6] =='e' ) {
+ return Mod::RENAME_TO; // with this return code we handle both RENAME_TO and RENAME_FROM
+ }
+ break;
+ }
+ default: break;
+ }
+ uassert( 10161 , "Invalid modifier specified " + string( fn ), false );
+ return Mod::INC;
+ }
+
+ ModSet() {}
+
+ void updateIsIndexed( const Mod &m, const set<string> &idxKeys, const set<string> *backgroundKeys ) {
+ if ( m.isIndexed( idxKeys ) ||
+ (backgroundKeys && m.isIndexed(*backgroundKeys)) ) {
+ _isIndexed++;
+ }
+ }
+
+ public:
+
+ ModSet( const BSONObj &from ,
+ const set<string>& idxKeys = set<string>(),
+ const set<string>* backgroundKeys = 0
+ );
+
+ // TODO: this is inefficient - should probably just handle when iterating
+ ModSet * fixDynamicArray( const char * elemMatchKey ) const;
+
+ bool hasDynamicArray() const { return _hasDynamicArray; }
+
+ /**
+ * creates a ModSetState suitable for operation on obj
+         * doesn't change or modify this ModSet or any underlying Mod
+ */
+ auto_ptr<ModSetState> prepare( const BSONObj& obj ) const;
+
+ /**
+ * given a query pattern, builds an object suitable for an upsert
+ * will take the query spec and combine all $ operators
+ */
+ BSONObj createNewFromQuery( const BSONObj& query );
+
+        /**
+         * @return the number of mods that touch an indexed field (0 if none)
+         */
+ int isIndexed() const {
+ return _isIndexed;
+ }
+
+ unsigned size() const { return _mods.size(); }
+
+ bool haveModForField( const char *fieldName ) const {
+ return _mods.find( fieldName ) != _mods.end();
+ }
+
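+        // Two mods conflict when one field path is a prefix of the other,
+        // e.g. { $set: { a: 1 } } together with { $set: { "a.b": 2 } }.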
+ bool haveConflictingMod( const string& fieldName ) {
+ size_t idx = fieldName.find( '.' );
+ if ( idx == string::npos )
+ idx = fieldName.size();
+
+ ModHolder::const_iterator start = _mods.lower_bound(fieldName.substr(0,idx));
+ for ( ; start != _mods.end(); start++ ) {
+ FieldCompareResult r = compareDottedFieldNames( fieldName , start->first );
+ switch ( r ) {
+ case LEFT_SUBFIELD: return true;
+ case LEFT_BEFORE: return false;
+ case SAME: return true;
+ case RIGHT_BEFORE: return false;
+ case RIGHT_SUBFIELD: return true;
+ }
+ }
+            return false;
+        }
+
+ };
+
+ /**
+ * stores any information about a single Mod operating on a single Object
+ */
+ class ModState {
+ public:
+ const Mod * m;
+ BSONElement old;
+ BSONElement newVal;
+ BSONObj _objData;
+
+ const char * fixedOpName;
+ BSONElement * fixed;
+ int pushStartSize;
+
+ BSONType incType;
+ int incint;
+ double incdouble;
+ long long inclong;
+
+ bool dontApply;
+
+ ModState() {
+ fixedOpName = 0;
+ fixed = 0;
+ pushStartSize = -1;
+ incType = EOO;
+ dontApply = false;
+ }
+
+ Mod::Op op() const {
+ return m->op;
+ }
+
+ const char * fieldName() const {
+ return m->fieldName;
+ }
+
+ bool needOpLogRewrite() const {
+ if ( dontApply )
+ return false;
+
+ if ( fixed || fixedOpName || incType )
+ return true;
+
+ switch( op() ) {
+ case Mod::RENAME_FROM:
+ case Mod::RENAME_TO:
+ return true;
+ case Mod::BIT:
+ case Mod::BITAND:
+ case Mod::BITOR:
+ // TODO: should we convert this to $set?
+ return false;
+ default:
+ return false;
+ }
+ }
+
+ void appendForOpLog( BSONObjBuilder& b ) const;
+
+ template< class Builder >
+ void apply( Builder& b , BSONElement in ) {
+ m->apply( b , in , *this );
+ }
+
+ template< class Builder >
+ void appendIncValue( Builder& b , bool useFullName ) const {
+ const char * n = useFullName ? m->fieldName : m->shortFieldName;
+
+ switch ( incType ) {
+ case NumberDouble:
+ b.append( n , incdouble ); break;
+ case NumberLong:
+ b.append( n , inclong ); break;
+ case NumberInt:
+ b.append( n , incint ); break;
+ default:
+ assert(0);
+ }
+ }
+
+ string toString() const;
+
+ template< class Builder >
+ void handleRename( Builder &newObjBuilder, const char *shortFieldName );
+ };
+
+ /**
+ * this is used to hold state, meta data while applying a ModSet to a BSONObj
+ * the goal is to make ModSet const so its re-usable
+ */
+ class ModSetState : boost::noncopyable {
+ struct FieldCmp {
+ bool operator()( const string &l, const string &r ) const;
+ };
+ typedef map<string,ModState,FieldCmp> ModStateHolder;
+ const BSONObj& _obj;
+ ModStateHolder _mods;
+ bool _inPlacePossible;
+ BSONObj _newFromMods; // keep this data alive, as oplog generation may depend on it
+
+ ModSetState( const BSONObj& obj )
+ : _obj( obj ) , _inPlacePossible(true) {
+ }
+
+ /**
+ * @return if in place is still possible
+ */
+ bool amIInPlacePossible( bool inPlacePossible ) {
+ if ( ! inPlacePossible )
+ _inPlacePossible = false;
+ return _inPlacePossible;
+ }
+
+ template< class Builder >
+ void createNewFromMods( const string& root , Builder& b , const BSONObj &obj );
+
+ template< class Builder >
+ void _appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen );
+
+ template< class Builder >
+ void appendNewFromMod( ModState& ms , Builder& b ) {
+ if ( ms.dontApply ) {
+ return;
+ }
+
+ //const Mod& m = *(ms.m); // HACK
+ Mod& m = *((Mod*)(ms.m)); // HACK
+
+ switch ( m.op ) {
+
+ case Mod::PUSH: {
+ if ( m.isEach() ) {
+ b.appendArray( m.shortFieldName, m.getEach() );
+ } else {
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ arr.appendAs( m.elt, "0" );
+ arr.done();
+ }
+ break;
+ }
+ case Mod::ADDTOSET: {
+ if ( m.isEach() ) {
+ // Remove any duplicates in given array
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ BSONElementSet toadd;
+ m.parseEach( toadd );
+ BSONObjIterator i( m.getEach() );
+ int n = 0;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( toadd.count(e) ) {
+ arr.appendAs( e , BSONObjBuilder::numStr( n++ ) );
+ toadd.erase( e );
+ }
+ }
+ arr.done();
+ }
+ else {
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ arr.appendAs( m.elt, "0" );
+ arr.done();
+ }
+ break;
+ }
+
+ case Mod::PUSH_ALL: {
+ b.appendAs( m.elt, m.shortFieldName );
+ break;
+ }
+
+ case Mod::UNSET:
+ case Mod::PULL:
+ case Mod::PULL_ALL:
+ // no-op b/c unset/pull of nothing does nothing
+ break;
+
+ case Mod::INC:
+ ms.fixedOpName = "$set";
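+                // deliberate fall-through: an $inc lands in the new object as
+                // a plain value, so it is logged as a $set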
+ case Mod::SET: {
+ m._checkForAppending( m.elt );
+ b.appendAs( m.elt, m.shortFieldName );
+ break;
+ }
+ // shouldn't see RENAME_FROM here
+ case Mod::RENAME_TO:
+ ms.handleRename( b, m.shortFieldName );
+ break;
+ default:
+ stringstream ss;
+ ss << "unknown mod in appendNewFromMod: " << m.op;
+ throw UserException( 9015, ss.str() );
+ }
+
+ }
+
+ public:
+
+ bool canApplyInPlace() const {
+ return _inPlacePossible;
+ }
+
+ /**
+         * modifies the underlying _obj
+ * @param isOnDisk - true means this is an on disk object, and this update needs to be made durable
+ */
+ void applyModsInPlace( bool isOnDisk );
+
+ BSONObj createNewFromMods();
+
+ // re-writing for oplog
+
+ bool needOpLogRewrite() const {
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ if ( i->second.needOpLogRewrite() )
+ return true;
+ return false;
+ }
+
+ BSONObj getOpLogRewrite() const {
+ BSONObjBuilder b;
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ i->second.appendForOpLog( b );
+ return b.obj();
+ }
+
+ bool haveArrayDepMod() const {
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ if ( i->second.m->arrayDep() )
+ return true;
+ return false;
+ }
+
+ void appendSizeSpecForArrayDepMods( BSONObjBuilder &b ) const {
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) {
+ const ModState& m = i->second;
+ if ( m.m->arrayDep() ) {
+ if ( m.pushStartSize == -1 )
+ b.appendNull( m.fieldName() );
+ else
+ b << m.fieldName() << BSON( "$size" << m.pushStartSize );
+ }
+ }
+ }
+
+ string toString() const;
+
+ friend class ModSet;
+ };
+
+}
+
diff --git a/src/mongo/db/pagefault.cpp b/src/mongo/db/pagefault.cpp
new file mode 100644
index 00000000000..4b9b1b23e02
--- /dev/null
+++ b/src/mongo/db/pagefault.cpp
@@ -0,0 +1,55 @@
+// @file pagefault.cpp
+
+#include "pch.h"
+#include "diskloc.h"
+#include "pagefault.h"
+#include "client.h"
+#include "pdfile.h"
+#include "server.h"
+
+namespace mongo {
+
+ PageFaultException::PageFaultException(Record *_r)
+ {
+ assert( cc()._pageFaultRetryableSection != 0 );
+ cc()._pageFaultRetryableSection->_laps++;
+ assert( cc()._pageFaultRetryableSection->_laps < 1000 );
+ r = _r;
+ era = LockMongoFilesShared::getEra();
+ }
+
+ void PageFaultException::touch() {
+ assert( !d.dbMutex.atLeastReadLocked() );
+ LockMongoFilesShared lk;
+ if( LockMongoFilesShared::getEra() != era ) {
+ // files opened and closed. we don't try to handle but just bail out; this is much simpler
+ // and less error prone and saves us from taking a dbmutex readlock.
+ dlog(2) << "era changed" << endl;
+ return;
+ }
+ r->touch();
+ }
+
+ PageFaultRetryableSection::~PageFaultRetryableSection() {
+ cc()._pageFaultRetryableSection = old;
+ }
+ PageFaultRetryableSection::PageFaultRetryableSection() {
+ _laps = 0;
+ old = cc()._pageFaultRetryableSection;
+ if( d.dbMutex.atLeastReadLocked() ) {
+ cc()._pageFaultRetryableSection = 0;
+ if( debug || logLevel > 2 ) {
+ LOGSOME << "info PageFaultRetryableSection will not yield, already locked upon reaching" << endl;
+ }
+ }
+ else if( cc()._pageFaultRetryableSection ) {
+ cc()._pageFaultRetryableSection = 0;
+ dlog(2) << "info nested PageFaultRetryableSection will not yield on fault" << endl;
+ }
+ else {
+ cc()._pageFaultRetryableSection = this;
+ cc()._hasWrittenThisPass = false;
+ }
+ }
+
+}
diff --git a/src/mongo/db/pagefault.h b/src/mongo/db/pagefault.h
new file mode 100644
index 00000000000..8bbf4ecab52
--- /dev/null
+++ b/src/mongo/db/pagefault.h
@@ -0,0 +1,46 @@
+// @file pagefault.h
+
+// define this : _PAGEFAULTEXCEPTION
+
+#pragma once
+
+namespace mongo {
+
+ class Record;
+
+ class PageFaultException /*: public DBException*/ {
+ unsigned era;
+ Record *r;
+ public:
+ PageFaultException(const PageFaultException& rhs) : era(rhs.era), r(rhs.r) { }
+ explicit PageFaultException(Record*);
+ void touch();
+ };
+
+ class PageFaultRetryableSection : boost::noncopyable {
+ PageFaultRetryableSection *old;
+ public:
+ unsigned _laps;
+ PageFaultRetryableSection();
+ ~PageFaultRetryableSection();
+ };
+#if 0
+ inline void how_to_use_example() {
+ // ...
+ {
+ PageFaultRetryableSection s;
+ while( 1 ) {
+ try {
+ writelock lk; // or readlock
+ // do work
+ break;
+ }
+ catch( PageFaultException& e ) {
+ e.touch();
+ }
+ }
+ }
+ // ...
+ }
+#endif
+}
diff --git a/src/mongo/db/pcre.txt b/src/mongo/db/pcre.txt
new file mode 100644
index 00000000000..3e21047eabc
--- /dev/null
+++ b/src/mongo/db/pcre.txt
@@ -0,0 +1,15 @@
+
+
+You need to install pcre.
+
+This could be scripted:
+
+cd /tmp
+curl -O ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-7.4.tar.gz
+tar -xzf pcre-7.4.tar.gz
+cd pcre-7.4
+./configure --enable-utf8 --with-match-limit=200000 --with-match-limit-recursion=4000
+make
+make install
+
+
+At that point it will be installed in /usr/*. The version in p/pcre-7.4 is for VC++.
diff --git a/src/mongo/db/pdfile.cpp b/src/mongo/db/pdfile.cpp
new file mode 100644
index 00000000000..069eeadec37
--- /dev/null
+++ b/src/mongo/db/pdfile.cpp
@@ -0,0 +1,2425 @@
+// pdfile.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+todo:
+_ table scans must be sequential, not next/prev pointers
+_ coalesce deleted
+_ disallow system* manipulations from the database.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "db.h"
+#include "../util/mmap.h"
+#include "../util/hashtab.h"
+#include "../util/file_allocator.h"
+#include "../util/processinfo.h"
+#include "../util/file.h"
+#include "btree.h"
+#include "btreebuilder.h"
+#include <algorithm>
+#include <list>
+#include "repl.h"
+#include "dbhelpers.h"
+#include "namespace-inl.h"
+#include "queryutil.h"
+#include "extsort.h"
+#include "curop-inl.h"
+#include "background.h"
+#include "compact.h"
+#include "ops/delete.h"
+#include "instance.h"
+#include "replutil.h"
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( sizeof(Extent)-4 == 48+128 );
+ BOOST_STATIC_ASSERT( sizeof(DataFileHeader)-4 == 8192 );
+
+ void printMemInfo( const char * where ) {
+ cout << "mem info: ";
+ if ( where )
+ cout << where << " ";
+ ProcessInfo pi;
+ if ( ! pi.supported() ) {
+ cout << " not supported" << endl;
+ return;
+ }
+
+ cout << "vsize: " << pi.getVirtualMemorySize() << " resident: " << pi.getResidentSize() << " mapped: " << ( MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ) << endl;
+ }
+
+ bool isValidNS( const StringData& ns ) {
+ // TODO: should check for invalid characters
+
+ const char * x = strchr( ns.data() , '.' );
+ if ( ! x )
+ return false;
+
+ x++;
+ return *x > 0;
+ }
+
+ bool inDBRepair = false;
+ struct doingRepair {
+ doingRepair() {
+ assert( ! inDBRepair );
+ inDBRepair = true;
+ }
+ ~doingRepair() {
+ inDBRepair = false;
+ }
+ };
+
+ map<string, unsigned> BackgroundOperation::dbsInProg;
+ set<string> BackgroundOperation::nsInProg;
+
+ bool BackgroundOperation::inProgForDb(const char *db) {
+ assertInWriteLock();
+ return dbsInProg[db] != 0;
+ }
+
+ bool BackgroundOperation::inProgForNs(const char *ns) {
+ assertInWriteLock();
+ return nsInProg.count(ns) != 0;
+ }
+
+ void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
+ uassert(12586, "cannot perform operation: a background operation is currently running for this database",
+ !inProgForDb(db));
+ }
+
+ void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
+ uassert(12587, "cannot perform operation: a background operation is currently running for this collection",
+ !inProgForNs(ns));
+ }
+
+ BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
+ assertInWriteLock();
+ dbsInProg[_ns.db]++;
+ assert( nsInProg.count(_ns.ns()) == 0 );
+ nsInProg.insert(_ns.ns());
+ }
+
+ BackgroundOperation::~BackgroundOperation() {
+ wassert( d.dbMutex.isWriteLocked() );
+ dbsInProg[_ns.db]--;
+ nsInProg.erase(_ns.ns());
+ }
+
+ void BackgroundOperation::dump(stringstream& ss) {
+ if( nsInProg.size() ) {
+ ss << "\n<b>Background Jobs in Progress</b>\n";
+ for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ )
+ ss << " " << *i << '\n';
+ }
+ for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
+ if( i->second )
+ ss << "database " << i->first << ": " << i->second << '\n';
+ }
+ }
+
+ /* ----------------------------------------- */
+
+ string dbpath = "/data/db/";
+ const char FREELIST_NS[] = ".$freelist";
+ bool directoryperdb = false;
+ string repairpath;
+ string pidfilepath;
+
+ DataFileMgr theDataFileMgr;
+ DatabaseHolder _dbHolder;
+ int MAGIC = 0x1000;
+
+ DatabaseHolder& dbHolderUnchecked() {
+ return _dbHolder;
+ }
+
+ void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
+ void ensureIdIndexForNewNs(const char *ns) {
+ if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) &&
+ strstr( ns, FREELIST_NS ) == 0 ) {
+ log( 1 ) << "adding _id index for collection " << ns << endl;
+ ensureHaveIdIndex( ns );
+ }
+ }
+
+ string getDbContext() {
+ stringstream ss;
+ Client * c = currentClient.get();
+ if ( c ) {
+ Client::Context * cx = c->getContext();
+ if ( cx ) {
+ Database *database = cx->db();
+ if ( database ) {
+ ss << database->name << ' ';
+ ss << cx->ns() << ' ';
+ }
+ }
+ }
+ return ss.str();
+ }
+
+ /*---------------------------------------------------------------------*/
+
+ // inheritable class to implement an operation that may be applied to all
+ // files in a database using _applyOpToDataFiles()
+ class FileOp {
+ public:
+ virtual ~FileOp() {}
+ // Return true if file exists and operation successful
+ virtual bool apply( const boost::filesystem::path &p ) = 0;
+ virtual const char * op() const = 0;
+ };
+
+ void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath );
+
+ void _deleteDataFiles(const char *database) {
+ if ( directoryperdb ) {
+ FileAllocator::get()->waitUntilFinished();
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ), "delete data files with a directoryperdb" );
+ return;
+ }
+ class : public FileOp {
+ virtual bool apply( const boost::filesystem::path &p ) {
+ return boost::filesystem::remove( p );
+ }
+ virtual const char * op() const {
+ return "remove";
+ }
+ } deleter;
+ _applyOpToDataFiles( database, deleter, true );
+ }
+
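+    // Initial extent sizing: 64x the record length for small records, 16x
+    // otherwise, capped at ~1GB and rounded down to a 256-byte multiple
+    // (e.g. initialSize(500) == 500*64 == 32000, already a multiple of 256).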
+ int Extent::initialSize(int len) {
+ long long sz = len * 16;
+ if ( len < 1000 ) sz = len * 64;
+ if ( sz > 1000000000 )
+ sz = 1000000000;
+ int z = ((int)sz) & 0xffffff00;
+ assert( z > len );
+ return z;
+ }
+
+ bool _userCreateNS(const char *ns, const BSONObj& options, string& err, bool *deferIdIndex) {
+ if ( nsdetails(ns) ) {
+ err = "collection already exists";
+ return false;
+ }
+
+ log(1) << "create collection " << ns << ' ' << options << endl;
+
+ /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field
+ and then go back and set to ok : 1 after we are done.
+ */
+ bool isFreeList = strstr(ns, FREELIST_NS) != 0;
+ if( !isFreeList )
+ addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options);
+
+ long long size = Extent::initialSize(128);
+ {
+ BSONElement e = options.getField("size");
+ if ( e.isNumber() ) {
+ size = e.numberLong();
+ size += 256;
+ size &= 0xffffffffffffff00LL;
+ }
+ }
+
+ uassert( 10083 , "create collection invalid size spec", size > 0 );
+
+ bool newCapped = false;
+ int mx = 0;
+ if( options["capped"].trueValue() ) {
+ newCapped = true;
+ BSONElement e = options.getField("max");
+ if ( e.isNumber() ) {
+ mx = e.numberInt();
+ }
+ }
+
+ // $nExtents just for debug/testing.
+ BSONElement e = options.getField( "$nExtents" );
+ Database *database = cc().database();
+ if ( e.type() == Array ) {
+ // We create one extent per array entry, with size specified
+ // by the array value.
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ int size = int( e.number() );
+ assert( size <= 0x7fffffff );
+ // $nExtents is just for testing - always allocate new extents
+                // rather than reuse existing extents so we have some predictability
+ // in the extent size used by our tests
+ database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped );
+ }
+ }
+ else if ( int( e.number() ) > 0 ) {
+ // We create '$nExtents' extents, each of size 'size'.
+ int nExtents = int( e.number() );
+ assert( size <= 0x7fffffff );
+ for ( int i = 0; i < nExtents; ++i ) {
+ assert( size <= 0x7fffffff );
+ // $nExtents is just for testing - always allocate new extents
+                // rather than reuse existing extents so we have some predictability
+ // in the extent size used by our tests
+ database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped );
+ }
+ }
+ else {
+            // This is the non-test case, where we don't have a $nExtents spec.
+ while ( size > 0 ) {
+ int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize;
+ int desiredExtentSize = (int) (size > max ? max : size);
+ if ( desiredExtentSize < Extent::minSize() ) {
+ desiredExtentSize = Extent::minSize();
+ }
+ desiredExtentSize &= 0xffffff00;
+ Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped, true );
+ size -= e->length;
+ }
+ }
+
+ NamespaceDetails *d = nsdetails(ns);
+ assert(d);
+
+ bool ensure = false;
+ if ( options.getField( "autoIndexId" ).type() ) {
+ if ( options["autoIndexId"].trueValue() ) {
+ ensure = true;
+ }
+ }
+ else {
+ if ( !newCapped ) {
+ ensure=true;
+ }
+ }
+ if( ensure ) {
+ if( deferIdIndex )
+ *deferIdIndex = true;
+ else
+ ensureIdIndexForNewNs( ns );
+ }
+
+ if ( mx > 0 )
+ getDur().writingInt( d->max ) = mx;
+
+ return true;
+ }
+
+ /** { ..., capped: true, size: ..., max: ... }
+       @param deferIdIndex - if not null, defers id index creation. sets the bool value to true if we wanted to create the id index.
+ @return true if successful
+ */
+ bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) {
+ const char *coll = strchr( ns, '.' ) + 1;
+ massert( 10356 , str::stream() << "invalid ns: " << ns , NamespaceString::validCollectionName(ns));
+ char cl[ 256 ];
+ nsToDatabase( ns, cl );
+ bool ok = _userCreateNS(ns, options, err, deferIdIndex);
+ if ( logForReplication && ok ) {
+ if ( options.getField( "create" ).eoo() ) {
+ BSONObjBuilder b;
+ b << "create" << coll;
+ b.appendElements( options );
+ options = b.obj();
+ }
+ string logNs = string( cl ) + ".$cmd";
+ logOp("c", logNs.c_str(), options);
+ }
+ return ok;
+ }
+
+ /*---------------------------------------------------------------------*/
+
+ int MongoDataFile::maxSize() {
+ if ( sizeof( int* ) == 4 ) {
+ return 512 * 1024 * 1024;
+ }
+ else if ( cmdLine.smallfiles ) {
+ return 0x7ff00000 >> 2;
+ }
+ else {
+ return 0x7ff00000;
+ }
+ }
+
+ NOINLINE_DECL void MongoDataFile::badOfs2(int ofs) const {
+ stringstream ss;
+ ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
+ uasserted(13441, ss.str());
+ }
+
+ NOINLINE_DECL void MongoDataFile::badOfs(int ofs) const {
+ stringstream ss;
+ ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
+ uasserted(13440, ss.str());
+ }
+
+ int MongoDataFile::defaultSize( const char *filename ) const {
+ int size;
+ if ( fileNo <= 4 )
+ size = (64*1024*1024) << fileNo;
+ else
+ size = 0x7ff00000;
+ if ( cmdLine.smallfiles ) {
+ size = size >> 2;
+ }
+ return size;
+ }
+
+ static void check(void *_mb) {
+ if( sizeof(char *) == 4 )
+ uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0);
+ else
+ uassert( 10085 , "can't map file memory", _mb != 0);
+ }
+
+ /** @return true if found and opened. if uninitialized (prealloc only) does not open. */
+ bool MongoDataFile::openExisting( const char *filename ) {
+ assert( _mb == 0 );
+ if( !exists(filename) )
+ return false;
+ if( !mmf.open(filename,false) ) {
+ dlog(2) << "info couldn't open " << filename << " probably end of datafile list" << endl;
+ return false;
+ }
+ _mb = mmf.getView(); assert(_mb);
+ unsigned long long sz = mmf.length();
+ assert( sz <= 0x7fffffff );
+ assert( sz % 4096 == 0 );
+ if( sz < 64*1024*1024 && !cmdLine.smallfiles ) {
+ if( sz >= 16*1024*1024 && sz % (1024*1024) == 0 ) {
+ log() << "info openExisting file size " << sz << " but cmdLine.smallfiles=false" << endl;
+ }
+ else {
+                log() << "openExisting size " << sz << " less than minimum file size expectation " << filename << endl;
+ assert(false);
+ }
+ }
+ check(_mb);
+ if( header()->uninitialized() )
+ return false;
+ return true;
+ }
+
+ void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) {
+ long size = defaultSize( filename );
+ while ( size < minSize ) {
+ if ( size < maxSize() / 2 )
+ size *= 2;
+ else {
+ size = maxSize();
+ break;
+ }
+ }
+ if ( size > maxSize() )
+ size = maxSize();
+
+ assert( size >= 64*1024*1024 || cmdLine.smallfiles );
+ assert( size % 4096 == 0 );
+
+ if ( preallocateOnly ) {
+ if ( cmdLine.prealloc ) {
+ FileAllocator::get()->requestAllocation( filename, size );
+ }
+ return;
+ }
+
+ {
+ assert( _mb == 0 );
+ unsigned long long sz = size;
+ if( mmf.create(filename, sz, false) )
+ _mb = mmf.getView();
+ assert( sz <= 0x7fffffff );
+ size = (int) sz;
+ }
+ check(_mb);
+ header()->init(fileNo, size, filename);
+ }
+
+ void MongoDataFile::flush( bool sync ) {
+ mmf.flush( sync );
+ }
+
+ void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
+ NamespaceIndex *ni = nsindex(ns);
+ NamespaceDetails *details = ni->details(ns);
+ if ( details ) {
+ assert( !details->lastExtent.isNull() );
+ assert( !details->firstExtent.isNull() );
+ getDur().writingDiskLoc(e->xprev) = details->lastExtent;
+ getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc;
+ assert( !eloc.isNull() );
+ getDur().writingDiskLoc(details->lastExtent) = eloc;
+ }
+ else {
+ ni->add_ns(ns, eloc, capped);
+ details = ni->details(ns);
+ }
+
+ {
+ NamespaceDetails *dw = details->writingWithoutExtra();
+ dw->lastExtentSize = e->length;
+ }
+ details->addDeletedRec(emptyLoc.drec(), emptyLoc);
+ }
+
+ Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
+ {
+ // make sizes align with VM page size
+ int newSize = (approxSize + 0xfff) & 0xfffff000;
+ assert( newSize >= 0 );
+ if( newSize < Extent::maxSize() )
+ approxSize = newSize;
+ }
+ massert( 10357 , "shutdown in progress", ! inShutdown() );
+ massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() );
+ massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed
+ int ExtentSize = min(header()->unusedLength, approxSize);
+ DiskLoc loc;
+ if ( ExtentSize < Extent::minSize() ) {
+            /* note there could be a lot of looping here if the db just started and
+               no files are open yet. we might want to do something about that. */
+ if ( loops > 8 ) {
+ assert( loops < 10000 );
+ out() << "warning: loops=" << loops << " fileno:" << fileNo << ' ' << ns << '\n';
+ }
+ log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n";
+ return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
+ }
+ int offset = header()->unused.getOfs();
+
+ DataFileHeader *h = header();
+ h->unused.writing().set( fileNo, offset + ExtentSize );
+ getDur().writingInt(h->unusedLength) = h->unusedLength - ExtentSize;
+ loc.set(fileNo, offset);
+ Extent *e = _getExtent(loc);
+ DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset, newCapped);
+
+ addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);
+
+ DEV tlog(1) << "new extent " << ns << " size: 0x" << hex << ExtentSize << " loc: 0x" << hex << offset
+ << " emptyLoc:" << hex << emptyLoc.getOfs() << dec << endl;
+ return e;
+ }
+
+ Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
+ string s = cc().database()->name + FREELIST_NS;
+ NamespaceDetails *f = nsdetails(s.c_str());
+ if( f ) {
+ int low, high;
+ if( capped ) {
+ // be strict about the size
+ low = approxSize;
+ if( low > 2048 ) low -= 256;
+ high = (int) (approxSize * 1.05) + 256;
+ }
+ else {
+ low = (int) (approxSize * 0.8);
+ high = (int) (approxSize * 1.4);
+ }
+ if( high <= 0 ) {
+ // overflowed
+ high = max(approxSize, Extent::maxSize());
+ }
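+            /* e.g. (illustrative numbers only) a non-capped request of approxSize
+               100000 searches the window [80000, 140000]; a capped request of the
+               same size searches the much stricter [99744, 105256], since capped
+               extents should closely match the requested size. */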
+ int n = 0;
+ Extent *best = 0;
+ int bestDiff = 0x7fffffff;
+ {
+ Timer t;
+ DiskLoc L = f->firstExtent;
+ while( !L.isNull() ) {
+ Extent * e = L.ext();
+ if( e->length >= low && e->length <= high ) {
+ int diff = abs(e->length - approxSize);
+ if( diff < bestDiff ) {
+ bestDiff = diff;
+ best = e;
+ if( ((double) diff) / approxSize < 0.1 ) {
+ // close enough
+ break;
+ }
+ if( t.seconds() >= 2 ) {
+ // have spent lots of time in write lock, and we are in [low,high], so close enough
+ // could come into play if extent freelist is very long
+ break;
+ }
+ }
+ else {
+ OCCASIONALLY {
+ if( high < 64 * 1024 && t.seconds() >= 2 ) {
+ // be less picky if it is taking a long time
+ high = 64 * 1024;
+ }
+ }
+ }
+ }
+ L = e->xnext;
+ ++n;
+ }
+ if( t.seconds() >= 10 ) {
+ log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl;
+ }
+ }
+
+ if( n > 128 ) log( n < 512 ) << "warning: newExtent " << n << " scanned\n";
+
+ if( best ) {
+ Extent *e = best;
+ // remove from the free list
+ if( !e->xprev.isNull() )
+ e->xprev.ext()->xnext.writing() = e->xnext;
+ if( !e->xnext.isNull() )
+ e->xnext.ext()->xprev.writing() = e->xprev;
+ if( f->firstExtent == e->myLoc )
+ f->firstExtent.writing() = e->xnext;
+ if( f->lastExtent == e->myLoc )
+ f->lastExtent.writing() = e->xprev;
+
+ // use it
+ OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
+ DiskLoc emptyLoc = e->reuse(ns, capped);
+ addNewExtentToNamespace(ns, e, e->myLoc, emptyLoc, capped);
+ return e;
+ }
+ }
+
+ return 0;
+ // return createExtent(ns, approxSize, capped);
+ }
+
+ /*---------------------------------------------------------------------*/
+
+ void Extent::markEmpty() {
+ xnext.Null();
+ xprev.Null();
+ firstRecord.Null();
+ lastRecord.Null();
+ }
+
+ DiskLoc Extent::reuse(const char *nsname, bool capped) {
+ return getDur().writing(this)->_reuse(nsname, capped);
+ }
+
+ void getEmptyLoc(const char *ns, const DiskLoc extentLoc, int extentLength, bool capped, /*out*/DiskLoc& emptyLoc, /*out*/int& delRecLength) {
+ emptyLoc = extentLoc;
+ emptyLoc.inc( Extent::HeaderSize() );
+ delRecLength = extentLength - Extent::HeaderSize();
+ if( delRecLength >= 32*1024 && str::contains(ns, '$') && !capped ) {
+ // probably an index. so skip forward to keep its records page aligned
+ int& ofs = emptyLoc.GETOFS();
+ int newOfs = (ofs + 0xfff) & ~0xfff;
+ delRecLength -= (newOfs-ofs);
+ dassert( delRecLength > 0 );
+ ofs = newOfs;
+ }
+ }
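+
+    /* illustration of the alignment above: if the empty region would start at
+       ofs 0x12ab0, newOfs = (0x12ab0 + 0xfff) & ~0xfff = 0x13000, so the deleted
+       record shrinks by 0x550 bytes and the btree buckets that follow start on a
+       4KB page boundary. */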
+
+ DiskLoc Extent::_reuse(const char *nsname, bool capped) {
+ LOG(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n';
+ massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 );
+ nsDiagnostic = nsname;
+ markEmpty();
+
+ DiskLoc emptyLoc;
+ int delRecLength;
+ getEmptyLoc(nsname, myLoc, length, capped, emptyLoc, delRecLength);
+
+ // todo: some dup code here and below in Extent::init
+ DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);
+ empty = getDur().writing(empty);
+ empty->lengthWithHeaders = delRecLength;
+ empty->extentOfs = myLoc.getOfs();
+ empty->nextDeleted.Null();
+
+ return emptyLoc;
+ }
+
+ /* assumes already zeroed -- insufficient for block 'reuse' perhaps */
+ DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset, bool capped) {
+ magic = 0x41424344;
+ myLoc.set(_fileNo, _offset);
+ xnext.Null();
+ xprev.Null();
+ nsDiagnostic = nsname;
+ length = _length;
+ firstRecord.Null();
+ lastRecord.Null();
+
+ DiskLoc emptyLoc;
+ int delRecLength;
+ getEmptyLoc(nsname, myLoc, _length, capped, emptyLoc, delRecLength);
+
+ DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength) );
+ empty->lengthWithHeaders = delRecLength;
+ empty->extentOfs = myLoc.getOfs();
+
+ return emptyLoc;
+ }
+
+ /*
+ Record* Extent::newRecord(int len) {
+        if( firstEmptyRegion.isNull() )
+ return 0;
+
+ assert(len > 0);
+ int newRecSize = len + Record::HeaderSize;
+ DiskLoc newRecordLoc = firstEmptyRegion;
+ Record *r = getRecord(newRecordLoc);
+ int left = r->netLength() - len;
+ if( left < 0 ) {
+ //
+ firstEmptyRegion.Null();
+ return 0;
+ }
+
+ DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion);
+ r->lengthWithHeaders = newRecSize;
+ r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent
+ if( !lastRecord.isNull() ) {
+ assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one
+ getRecord(lastRecord)->next.set(newRecordLoc); // until now
+ r->prev.set(lastRecord);
+ }
+ else {
+ r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent
+ assert( firstRecord.isNull() );
+ firstRecord = newRecordLoc;
+ }
+ lastRecord = newRecordLoc;
+
+ if( left < Record::HeaderSize + 32 ) {
+ firstEmptyRegion.Null();
+ }
+ else {
+ firstEmptyRegion.inc(newRecSize);
+ Record *empty = getRecord(firstEmptyRegion);
+            empty->next.set(nextEmpty); // unlike for in-use records, next and prev can be null for empty records.
+ empty->prev.Null();
+ empty->lengthWithHeaders = left;
+ }
+
+ return r;
+ }
+ */
+
+ int Extent::maxSize() {
+ int maxExtentSize = 0x7ff00000;
+ if ( cmdLine.smallfiles ) {
+ maxExtentSize >>= 2;
+ }
+ return maxExtentSize;
+ }
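+
+    /* i.e. roughly 2047MB normally and roughly 511MB with --smallfiles, matching
+       the largest datafile size so a single extent never outgrows its file. */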
+
+ /*---------------------------------------------------------------------*/
+
+ shared_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) {
+ NamespaceDetails * d = nsdetails( ns );
+ if ( ! d )
+ return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
+
+ DiskLoc loc = d->firstExtent;
+ Extent *e = getExtent(loc);
+
+ DEBUGGING {
+ out() << "listing extents for " << ns << endl;
+ DiskLoc tmp = loc;
+ set<DiskLoc> extents;
+
+ while ( 1 ) {
+ Extent *f = getExtent(tmp);
+ out() << "extent: " << tmp.toString() << endl;
+ extents.insert(tmp);
+ tmp = f->xnext;
+ if ( tmp.isNull() )
+ break;
+ f = f->getNextExtent();
+ }
+
+ out() << endl;
+ d->dumpDeleted(&extents);
+ }
+
+ if ( d->capped )
+ return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) );
+
+ if ( !startLoc.isNull() )
+ return shared_ptr<Cursor>(new BasicCursor( startLoc ));
+
+ while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
+            /* todo: if extent is empty, free it for reuse elsewhere.
+               that is a bit complicated; we would have to clean up the freelists.
+            */
+ RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead. ns:" << ns << endl;
+ // find a nonempty extent
+ // it might be nice to free the whole extent here! but have to clean up free recs then.
+ e = e->getNextExtent();
+ }
+ return shared_ptr<Cursor>(new BasicCursor( e->firstRecord ));
+ }
+
+ /* get a table scan cursor, but can be forward or reverse direction.
+ order.$natural - if set, > 0 means forward (asc), < 0 backward (desc).
+ */
+ shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) {
+ BSONElement el = order.getField("$natural"); // e.g., { $natural : -1 }
+
+ if ( el.number() >= 0 )
+ return DataFileMgr::findAll(ns, startLoc);
+
+ // "reverse natural order"
+ NamespaceDetails *d = nsdetails(ns);
+
+ if ( !d )
+ return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
+
+ if ( !d->capped ) {
+ if ( !startLoc.isNull() )
+ return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
+ Extent *e = d->lastExtent.ext();
+ while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
+ OCCASIONALLY out() << " findTableScan: extent empty, skipping ahead" << endl;
+ e = e->getPrevExtent();
+ }
+ return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
+ }
+ else {
+ return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );
+ }
+ }
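+
+    /* usage sketch (hypothetical caller): a reverse scan is requested via the
+       $natural sort spec, e.g.
+           shared_ptr<Cursor> c = findTableScan("test.foo", BSON("$natural" << -1), DiskLoc());
+           while( c->ok() ) { BSONObj o = c->current(); c->advance(); }
+       a missing or non-negative $natural (note number() is 0 for a missing
+       element) takes the forward findAll() path above. */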
+
+ void printFreeList() {
+ string s = cc().database()->name + FREELIST_NS;
+ log() << "dump freelist " << s << endl;
+ NamespaceDetails *freeExtents = nsdetails(s.c_str());
+ if( freeExtents == 0 ) {
+ log() << " freeExtents==0" << endl;
+ return;
+ }
+ DiskLoc a = freeExtents->firstExtent;
+ while( !a.isNull() ) {
+ Extent *e = a.ext();
+ log() << " extent " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << endl;
+ a = e->xnext;
+ }
+
+ log() << "end freelist" << endl;
+ }
+
+    /** free a list of extents that are no longer in use. this is a doubly linked list of extents
+        (could be just one in the list)
+    */
+ void freeExtents(DiskLoc firstExt, DiskLoc lastExt) {
+ {
+ assert( !firstExt.isNull() && !lastExt.isNull() );
+ Extent *f = firstExt.ext();
+ Extent *l = lastExt.ext();
+ assert( f->xprev.isNull() );
+ assert( l->xnext.isNull() );
+ assert( f==l || !f->xnext.isNull() );
+ assert( f==l || !l->xprev.isNull() );
+ }
+
+ string s = cc().database()->name + FREELIST_NS;
+ NamespaceDetails *freeExtents = nsdetails(s.c_str());
+ if( freeExtents == 0 ) {
+ string err;
+ _userCreateNS(s.c_str(), BSONObj(), err, 0); // todo: this actually allocates an extent, which is bad!
+ freeExtents = nsdetails(s.c_str());
+ massert( 10361 , "can't create .$freelist", freeExtents);
+ }
+ if( freeExtents->firstExtent.isNull() ) {
+ freeExtents->firstExtent.writing() = firstExt;
+ freeExtents->lastExtent.writing() = lastExt;
+ }
+ else {
+ DiskLoc a = freeExtents->firstExtent;
+ assert( a.ext()->xprev.isNull() );
+ getDur().writingDiskLoc( a.ext()->xprev ) = lastExt;
+ getDur().writingDiskLoc( lastExt.ext()->xnext ) = a;
+ getDur().writingDiskLoc( freeExtents->firstExtent ) = firstExt;
+ }
+
+ //printFreeList();
+ }
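+
+    /* note the else branch above splices the freed chain onto the *front* of the
+       freelist -- three DiskLoc writes total: the old head's xprev, the freed
+       tail's xnext, and the freelist's firstExtent. */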
+
+ /* drop a collection/namespace */
+ void dropNS(const string& nsToDrop) {
+ NamespaceDetails* d = nsdetails(nsToDrop.c_str());
+ uassert( 10086 , (string)"ns not found: " + nsToDrop , d );
+
+ BackgroundOperation::assertNoBgOpInProgForNs(nsToDrop.c_str());
+
+ NamespaceString s(nsToDrop);
+ assert( s.db == cc().database()->name );
+ if( s.isSystem() ) {
+ if( s.coll == "system.profile" )
+ uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 );
+ else
+ uasserted( 12502, "can't drop system ns" );
+ }
+
+ {
+ // remove from the system catalog
+ BSONObj cond = BSON( "name" << nsToDrop ); // { name: "colltodropname" }
+ string system_namespaces = cc().database()->name + ".system.namespaces";
+ /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true);
+ // no check of return code as this ns won't exist for some of the new storage engines
+ }
+
+ // free extents
+ if( !d->firstExtent.isNull() ) {
+ freeExtents(d->firstExtent, d->lastExtent);
+ getDur().writingDiskLoc( d->firstExtent ).setInvalid();
+ getDur().writingDiskLoc( d->lastExtent ).setInvalid();
+ }
+
+ // remove from the catalog hashtable
+ cc().database()->namespaceIndex.kill_ns(nsToDrop.c_str());
+ }
+
+ void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ) {
+ log(1) << "dropCollection: " << name << endl;
+ NamespaceDetails *d = nsdetails(name.c_str());
+ if( d == 0 )
+ return;
+
+ BackgroundOperation::assertNoBgOpInProgForNs(name.c_str());
+
+ if ( d->nIndexes != 0 ) {
+ try {
+ assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) );
+ }
+ catch( DBException& e ) {
+ stringstream ss;
+ ss << "drop: dropIndexes for collection failed - consider trying repair ";
+ ss << " cause: " << e.what();
+ uasserted(12503,ss.str());
+ }
+ assert( d->nIndexes == 0 );
+ }
+ log(1) << "\t dropIndexes done" << endl;
+ result.append("ns", name.c_str());
+ ClientCursor::invalidate(name.c_str());
+ Top::global.collectionDropped( name );
+ NamespaceDetailsTransient::eraseForPrefix( name.c_str() );
+ dropNS(name);
+ }
+
+ /* unindex all keys in index for this record. */
+ static void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) {
+ BSONObjSet keys;
+ id.getKeysFromObject(obj, keys);
+ IndexInterface& ii = id.idxInterface();
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ BSONObj j = *i;
+
+ bool ok = false;
+ try {
+ ok = ii.unindex(id.head, id, j, dl);
+ }
+ catch (AssertionException& e) {
+ problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl;
+ out() << "Assertion failure: _unindex failed: " << e.what() << '\n';
+ out() << " obj:" << obj.toString() << '\n';
+ out() << " key:" << j.toString() << '\n';
+ out() << " dl:" << dl.toString() << endl;
+ sayDbContext();
+ }
+
+ if ( !ok && logMissing ) {
+ log() << "unindex failed (key too big?) " << id.indexNamespace() << " key: " << j << " " << obj["_id"] << endl;
+ }
+ }
+ }
+ /* unindex all keys in all indexes for this record. */
+ static void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) {
+ BSONObj obj(todelete);
+ int n = d->nIndexes;
+ for ( int i = 0; i < n; i++ )
+ _unindexRecord(d->idx(i), obj, dl, !noWarn);
+ if( d->indexBuildInProgress ) { // background index
+            // always pass noWarn here, since this key may legitimately be missing while we are concurrently building the index
+ _unindexRecord(d->idx(n), obj, dl, false);
+ }
+ }
+
+ /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
+ caller must check if capped
+ */
+ void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) {
+ /* remove ourself from the record next/prev chain */
+ {
+ if ( todelete->prevOfs != DiskLoc::NullOfs )
+ getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;
+ if ( todelete->nextOfs != DiskLoc::NullOfs )
+ getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs;
+ }
+
+ /* remove ourself from extent pointers */
+ {
+ Extent *e = getDur().writing( todelete->myExtent(dl) );
+ if ( e->firstRecord == dl ) {
+ if ( todelete->nextOfs == DiskLoc::NullOfs )
+ e->firstRecord.Null();
+ else
+ e->firstRecord.set(dl.a(), todelete->nextOfs);
+ }
+ if ( e->lastRecord == dl ) {
+ if ( todelete->prevOfs == DiskLoc::NullOfs )
+ e->lastRecord.Null();
+ else
+ e->lastRecord.set(dl.a(), todelete->prevOfs);
+ }
+ }
+
+ /* add to the free list */
+ {
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize -= todelete->netLength();
+ s->nrecords--;
+ }
+
+ if ( strstr(ns, ".system.indexes") ) {
+ /* temp: if in system.indexes, don't reuse, and zero out: we want to be
+ careful until validated more, as IndexDetails has pointers
+ to this disk location. so an incorrectly done remove would cause
+ a lot of problems.
+ */
+ memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders);
+ }
+ else {
+ DEV {
+ unsigned long long *p = (unsigned long long *) todelete->data;
+ *getDur().writing(p) = 0;
+ //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+ }
+ d->addDeletedRec((DeletedRecord*)todelete, dl);
+ }
+ }
+ }
+
+ void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) {
+ dassert( todelete == dl.rec() );
+
+ NamespaceDetails* d = nsdetails(ns);
+ if ( d->capped && !cappedOK ) {
+ out() << "failing remove on a capped ns " << ns << endl;
+ uassert( 10089 , "can't remove from a capped collection" , 0 );
+ return;
+ }
+
+ BSONObj toDelete;
+ if ( doLog ) {
+ BSONElement e = dl.obj()["_id"];
+ if ( e.type() ) {
+ toDelete = e.wrap();
+ }
+ }
+
+ /* check if any cursors point to us. if so, advance them. */
+ ClientCursor::aboutToDelete(dl);
+
+ unindexRecord(d, todelete, dl, noWarn);
+
+ _deleteRecord(d, ns, todelete, dl);
+ NamespaceDetailsTransient::get( ns ).notifyOfWriteOp();
+
+ if ( ! toDelete.isEmpty() ) {
+ logOp( "d" , ns , toDelete );
+ }
+ }
+
+
+ /** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record.
+ */
+ const DiskLoc DataFileMgr::updateRecord(
+ const char *ns,
+ NamespaceDetails *d,
+ NamespaceDetailsTransient *nsdt,
+ Record *toupdate, const DiskLoc& dl,
+ const char *_buf, int _len, OpDebug& debug, bool god) {
+
+ dassert( toupdate == dl.rec() );
+
+ BSONObj objOld(toupdate);
+ BSONObj objNew(_buf);
+ DEV assert( objNew.objsize() == _len );
+ DEV assert( objNew.objdata() == _buf );
+
+ if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
+ /* add back the old _id value if the update removes it. Note this implementation is slow
+ (copies entire object multiple times), but this shouldn't happen often, so going for simple
+ code, not speed.
+ */
+ BSONObjBuilder b;
+ BSONElement e;
+ assert( objOld.getObjectID(e) );
+ b.append(e); // put _id first, for best performance
+ b.appendElements(objNew);
+ objNew = b.obj();
+ }
+
+ /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
+ below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
+ */
+ vector<IndexChanges> changes;
+ bool changedId = false;
+ getIndexChanges(changes, *d, objNew, objOld, changedId);
+ uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId );
+ dupCheck(changes, *d, dl);
+
+ if ( toupdate->netLength() < objNew.objsize() ) {
+ // doesn't fit. reallocate -----------------------------------------------------
+ uassert( 10003 , "failing update: objects in a capped ns cannot grow", !(d && d->capped));
+ d->paddingTooSmall();
+ debug.moved = true;
+ deleteRecord(ns, toupdate, dl);
+ return insert(ns, objNew.objdata(), objNew.objsize(), god);
+ }
+
+ nsdt->notifyOfWriteOp();
+ d->paddingFits();
+
+ /* have any index keys changed? */
+ {
+ int keyUpdates = 0;
+ int z = d->nIndexesBeingBuilt();
+ for ( int x = 0; x < z; x++ ) {
+ IndexDetails& idx = d->idx(x);
+ IndexInterface& ii = idx.idxInterface();
+ for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) {
+ try {
+ bool found = ii.unindex(idx.head, idx, *changes[x].removed[i], dl);
+ if ( ! found ) {
+ RARELY warning() << "ns: " << ns << " couldn't unindex key: " << *changes[x].removed[i]
+ << " for doc: " << objOld["_id"] << endl;
+ }
+ }
+ catch (AssertionException&) {
+ debug.extra << " exception update unindex ";
+ problem() << " caught assertion update unindex " << idx.indexNamespace() << endl;
+ }
+ }
+ assert( !dl.isNull() );
+ BSONObj idxKey = idx.info.obj().getObjectField("key");
+ Ordering ordering = Ordering::make(idxKey);
+ keyUpdates += changes[x].added.size();
+ for ( unsigned i = 0; i < changes[x].added.size(); i++ ) {
+ try {
+ /* we did the dupCheck() above. so we don't have to worry about it here. */
+ ii.bt_insert(
+ idx.head,
+ dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
+ }
+ catch (AssertionException& e) {
+ debug.extra << " exception update index ";
+ problem() << " caught assertion update index " << idx.indexNamespace() << " " << e << " " << objNew["_id"] << endl;
+ }
+ }
+ }
+
+ debug.keyUpdates = keyUpdates;
+ }
+
+ // update in place
+ int sz = objNew.objsize();
+ memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz);
+ return dl;
+ }
+
+ int Extent::followupSize(int len, int lastExtentLen) {
+ assert( len < Extent::maxSize() );
+ int x = initialSize(len);
+ // changed from 1.20 to 1.35 in v2.1.x to get to larger extent size faster
+ int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.35);
+ int sz = y > x ? y : x;
+
+ if ( sz < lastExtentLen ) {
+ // this means there was an int overflow
+ // so we should turn it into maxSize
+ sz = Extent::maxSize();
+ }
+ else if ( sz > Extent::maxSize() ) {
+ sz = Extent::maxSize();
+ }
+
+ sz = ((int)sz) & 0xffffff00;
+ assert( sz > len );
+
+ return sz;
+ }
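+
+    /* worked example (illustrative numbers): lastExtentLen = 1,000,000 (< 4MB)
+       is quadrupled to 4,000,000; a 10,000,000 byte last extent (>= 4MB) grows
+       by 1.35x to 13,500,000 instead. the 0xffffff00 mask then rounds down to a
+       256-byte multiple, and the result is clamped to Extent::maxSize(). */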
+
+    /* step one of adding keys to index idxNo for a new record.
+       the function returns nothing itself; the caller inspects the out-param keys
+       set -- more than one key means a multikey index and more work to do in step two.
+    */
+ static void _addKeysToIndexStepOneOfTwo(BSONObjSet & /*out*/keys, NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, IndexDetails& idx) {
+ idx.getKeysFromObject(obj, keys);
+ if( keys.empty() )
+ return;
+ bool dupsAllowed = !idx.unique();
+ BSONObj order = idx.keyPattern();
+ IndexInterface& ii = idx.idxInterface();
+ Ordering ordering = Ordering::make(order);
+
+ assert( !recordLoc.isNull() );
+
+ try {
+            // we can't do the two step method with multikeys, as insertion of one key changes the index's
+            // structure. however we can do the first key of the set, so we go ahead and do that FWIW
+ ii.phasedQueueItemToInsert(idxNo, idx.head, recordLoc, *keys.begin(), ordering, idx, dupsAllowed);
+ }
+ catch (AssertionException& e) {
+ if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
+ DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
+ }
+ else {
+ throw;
+ }
+ }
+ }
+
+ namespace dur {
+ extern unsigned notesThisLock;
+ }
+
+ void upgradeToWritable(bool shouldBeUnlocked) {
+ // todo upgrade!
+ DEV {
+ // verify we haven't written yet (usually)
+
+ // test binary does special things so this would assert there so don't check there
+ if( shouldBeUnlocked && !cmdLine.binaryName.empty() && cmdLine.binaryName != "test" ) {
+ static unsigned long long zeroes;
+ static unsigned long long tot;
+ tot++;
+ if( dur::notesThisLock == 0 )
+ zeroes++;
+ if( tot > 1000 ) {
+ static int n;
+ DEV if( n++ == 0 )
+ log() << "warning upgradeToWritable: already in writable too often" << endl;
+ }
+ }
+ }
+ }
+
+ /** add index keys for a newly inserted record
+ done in two steps/phases to defer write lock portion
+ */
+ static void indexRecordUsingTwoSteps(NamespaceDetails *d, BSONObj obj, DiskLoc loc, bool shouldBeUnlocked) {
+ vector<int> multi;
+ vector<BSONObjSet> multiKeys;
+
+ IndexInterface::phasedBegin();
+
+ int n = d->nIndexesBeingBuilt();
+ {
+ BSONObjSet keys;
+ for ( int i = 0; i < n; i++ ) {
+ IndexDetails& idx = d->idx(i);
+ // this call throws on unique constraint violation. we haven't done any writes yet so that is fine.
+ _addKeysToIndexStepOneOfTwo(/*out*/keys, d, i, obj, loc, idx);
+ if( keys.size() > 1 ) {
+ multi.push_back(i);
+ multiKeys.push_back(BSONObjSet());
+ multiKeys[multiKeys.size()-1].swap(keys);
+ }
+ keys.clear();
+ }
+ }
+
+ // update lock to writable here. TODO
+
+ upgradeToWritable(shouldBeUnlocked);
+
+ IndexInterface::phasedFinish(); // step 2
+
+ // now finish adding multikeys
+ for( unsigned j = 0; j < multi.size(); j++ ) {
+ unsigned i = multi[j];
+ BSONObjSet& keys = multiKeys[j];
+ IndexDetails& idx = d->idx(i);
+ IndexInterface& ii = idx.idxInterface();
+ Ordering ordering = Ordering::make(idx.keyPattern());
+ d->setIndexIsMultikey(i);
+ for( BSONObjSet::iterator k = ++keys.begin()/*skip 1*/; k != keys.end(); k++ ) {
+ try {
+ ii.bt_insert(idx.head, loc, *k, ordering, !idx.unique(), idx);
+ } catch (AssertionException& e) {
+ if( e.getCode() == 10287 && (int) i == d->nIndexes ) {
+ DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
+ }
+ else {
+ /* roll back previously added index entries
+ note must do self index as it is multikey and could require some cleanup itself
+ */
+ for( int j = 0; j < n; j++ ) {
+ try {
+ _unindexRecord(d->idx(j), obj, loc, false);
+ }
+ catch(...) {
+ log(3) << "unindex fails on rollback after unique key constraint prevented insert\n";
+ }
+ }
+ throw;
+ }
+ }
+ }
+ }
+ }
+
+ /* add keys to index idxNo for a new record */
+ static void addKeysToIndex(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, bool dupsAllowed) {
+ IndexDetails& idx = d->idx(idxNo);
+ BSONObjSet keys;
+ idx.getKeysFromObject(obj, keys);
+ if( keys.empty() )
+ return;
+ BSONObj order = idx.keyPattern();
+ IndexInterface& ii = idx.idxInterface();
+ Ordering ordering = Ordering::make(order);
+ int n = 0;
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ if( ++n == 2 ) {
+ d->setIndexIsMultikey(idxNo);
+ }
+ assert( !recordLoc.isNull() );
+ try {
+ ii.bt_insert(idx.head, recordLoc, *i, ordering, dupsAllowed, idx);
+ }
+ catch (AssertionException& e) {
+ if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
+ DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
+ continue;
+ }
+ if( !dupsAllowed ) {
+ // dup key exception, presumably.
+ throw;
+ }
+ problem() << " caught assertion addKeysToIndex " << idx.indexNamespace() << " " << obj["_id"] << endl;
+ }
+ }
+ }
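+
+    /* note on the ++n == 2 test above: the second key emitted for one document
+       is what flags the index as multikey. e.g. (hypothetical docs) indexing
+       { a: [ 1, 2 ] } on { a: 1 } yields two keys and sets the flag, while
+       { a: 1 } yields one key and leaves it alone. */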
+
+#if 0
+ void testSorting() {
+ BSONObjBuilder b;
+ b.appendNull("");
+ BSONObj x = b.obj();
+
+ BSONObjExternalSorter sorter(*IndexDetails::iis[1]);
+
+ sorter.add(x, DiskLoc(3,7));
+ sorter.add(x, DiskLoc(4,7));
+ sorter.add(x, DiskLoc(2,7));
+ sorter.add(x, DiskLoc(1,7));
+ sorter.add(x, DiskLoc(3,77));
+
+ sorter.sort();
+
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ while( i->more() ) {
+ BSONObjExternalSorter::Data d = i->next();
+ /*cout << d.second.toString() << endl;
+ cout << d.first.objsize() << endl;
+ cout<<"SORTER next:" << d.first.toString() << endl;*/
+ }
+ }
+#endif
+
+ SortPhaseOne *precalced = 0;
+
+ template< class V >
+ void buildBottomUpPhases2And3(bool dupsAllowed, IndexDetails& idx, BSONObjExternalSorter& sorter,
+ bool dropDups, list<DiskLoc> &dupsToDrop, CurOp * op, SortPhaseOne *phase1, ProgressMeterHolder &pm,
+ Timer& t
+ )
+ {
+ BtreeBuilder<V> btBuilder(dupsAllowed, idx);
+ BSONObj keyLast;
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ assert( pm == op->setMessage( "index: (2/3) btree bottom up" , phase1->nkeys , 10 ) );
+ while( i->more() ) {
+ RARELY killCurrentOp.checkForInterrupt();
+ BSONObjExternalSorter::Data d = i->next();
+
+ try {
+ if ( !dupsAllowed && dropDups ) {
+ LastError::Disabled led( lastError.get() );
+ btBuilder.addKey(d.first, d.second);
+ }
+ else {
+ btBuilder.addKey(d.first, d.second);
+ }
+ }
+ catch( AssertionException& e ) {
+ if ( dupsAllowed ) {
+                    // unknown exception??
+ throw;
+ }
+
+ if( e.interrupted() ) {
+ killCurrentOp.checkForInterrupt();
+ }
+
+ if ( ! dropDups )
+ throw;
+
+ /* we could queue these on disk, but normally there are very few dups, so instead we
+ keep in ram and have a limit.
+ */
+ dupsToDrop.push_back(d.second);
+                uassert( 10092 , "too many dups on index build with dropDups=true", dupsToDrop.size() < 1000000 );
+ }
+ pm.hit();
+ }
+ pm.finished();
+ op->setMessage( "index: (3/3) btree-middle" );
+ log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
+ btBuilder.commit();
+ if ( btBuilder.getn() != phase1->nkeys && ! dropDups ) {
+ warning() << "not all entries were added to the index, probably some keys were too large" << endl;
+ }
+ }
+
+ // throws DBException
+ unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ CurOp * op = cc().curop();
+
+ Timer t;
+
+ tlog(1) << "fastBuildIndex " << ns << " idxNo:" << idxNo << ' ' << idx.info.obj().toString() << endl;
+
+ bool dupsAllowed = !idx.unique();
+ bool dropDups = idx.dropDups() || inDBRepair;
+ BSONObj order = idx.keyPattern();
+
+ getDur().writingDiskLoc(idx.head).Null();
+
+ if ( logLevel > 1 ) printMemInfo( "before index start" );
+
+ /* get and sort all the keys ----- */
+ ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
+ SortPhaseOne _ours;
+ SortPhaseOne *phase1 = precalced;
+ if( phase1 == 0 ) {
+ phase1 = &_ours;
+ SortPhaseOne& p1 = *phase1;
+ shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ p1.sorter.reset( new BSONObjExternalSorter(idx.idxInterface(), order) );
+ p1.sorter->hintNumObjects( d->stats.nrecords );
+ const IndexSpec& spec = idx.getSpec();
+ while ( c->ok() ) {
+ BSONObj o = c->current();
+ DiskLoc loc = c->currLoc();
+ p1.addKeys(spec, o, loc);
+ c->advance();
+ pm.hit();
+ if ( logLevel > 1 && p1.n % 10000 == 0 ) {
+ printMemInfo( "\t iterating objects" );
+ }
+            }
+ }
+ pm.finished();
+
+ BSONObjExternalSorter& sorter = *(phase1->sorter);
+
+ if( phase1->multi )
+ d->setIndexIsMultikey(idxNo);
+
+ if ( logLevel > 1 ) printMemInfo( "before final sort" );
+ phase1->sorter->sort();
+ if ( logLevel > 1 ) printMemInfo( "after final sort" );
+
+        log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files in " << t.seconds() << " secs" << endl;
+
+ list<DiskLoc> dupsToDrop;
+
+ /* build index --- */
+ if( idx.version() == 0 )
+ buildBottomUpPhases2And3<V0>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
+ else if( idx.version() == 1 )
+ buildBottomUpPhases2And3<V1>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
+ else
+ assert(false);
+
+ log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;
+
+ for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ ){
+ theDataFileMgr.deleteRecord( ns, i->rec(), *i, false /* cappedOk */ , true /* noWarn */ , isMaster( ns ) /* logOp */ );
+ getDur().commitIfNeeded();
+ }
+
+ return phase1->n;
+ }
+
+ class BackgroundIndexBuildJob : public BackgroundOperation {
+
+ unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ bool dupsAllowed = !idx.unique();
+ bool dropDups = idx.dropDups();
+
+ ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords );
+
+ unsigned long long n = 0;
+ auto_ptr<ClientCursor> cc;
+ {
+ shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) );
+ }
+ CursorId id = cc->cursorid();
+
+ while ( cc->ok() ) {
+ BSONObj js = cc->current();
+ try {
+ {
+ if ( !dupsAllowed && dropDups ) {
+ LastError::Disabled led( lastError.get() );
+ addKeysToIndex(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ }
+ else {
+ addKeysToIndex(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ }
+ }
+ cc->advance();
+ }
+ catch( AssertionException& e ) {
+ if( e.interrupted() ) {
+ killCurrentOp.checkForInterrupt();
+ }
+
+ if ( dropDups ) {
+ DiskLoc toDelete = cc->currLoc();
+ bool ok = cc->advance();
+ cc->updateLocation();
+ theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true , true );
+ if( ClientCursor::find(id, false) == 0 ) {
+ cc.release();
+ if( !ok ) {
+ /* we were already at the end. normal. */
+ }
+ else {
+ uasserted(12585, "cursor gone during bg index; dropDups");
+ }
+ break;
+ }
+ }
+ else {
+ log() << "background addExistingToIndex exception " << e.what() << endl;
+ throw;
+ }
+ }
+ n++;
+ progress.hit();
+
+ getDur().commitIfNeeded();
+
+ if ( cc->yieldSometimes( ClientCursor::WillNeed ) ) {
+ progress.setTotalWhileRunning( d->stats.nrecords );
+ }
+ else {
+ cc.release();
+ uasserted(12584, "cursor gone during bg index");
+ break;
+ }
+ }
+ progress.finished();
+ return n;
+ }
+
+ /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
+ that way on a crash/restart, we don't think we are still building one. */
+ set<NamespaceDetails*> bgJobsInProgress;
+
+ void prep(const char *ns, NamespaceDetails *d) {
+ assertInWriteLock();
+ uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , mongo::d.dbMutex.getState() == 1 );
+ bgJobsInProgress.insert(d);
+ }
+ void done(const char *ns, NamespaceDetails *d) {
+ NamespaceDetailsTransient::get(ns).addedIndex(); // clear query optimizer cache
+ assertInWriteLock();
+ }
+
+ public:
+ BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { }
+
+ unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ unsigned long long n = 0;
+
+ prep(ns.c_str(), d);
+ assert( idxNo == d->nIndexes );
+ try {
+ idx.head.writing() = idx.idxInterface().addBucket(idx);
+ n = addExistingToIndex(ns.c_str(), d, idx, idxNo);
+ }
+ catch(...) {
+ if( cc().database() && nsdetails(ns.c_str()) == d ) {
+ assert( idxNo == d->nIndexes );
+ done(ns.c_str(), d);
+ }
+ else {
+ log() << "ERROR: db gone during bg index?" << endl;
+ }
+ throw;
+ }
+ assert( idxNo == d->nIndexes );
+ done(ns.c_str(), d);
+ return n;
+ }
+ };
+
+ /**
+ * For the lifetime of this object, an index build is indicated on the specified
+ * namespace and the newest index is marked as absent. This simplifies
+ * the cleanup required on recovery.
+ */
+ class RecoverableIndexState {
+ public:
+ RecoverableIndexState( NamespaceDetails *d ) : _d( d ) {
+ indexBuildInProgress() = 1;
+ nIndexes()--;
+ }
+ ~RecoverableIndexState() {
+ DESTRUCTOR_GUARD (
+ nIndexes()++;
+ indexBuildInProgress() = 0;
+ )
+ }
+ private:
+ int &nIndexes() { return getDur().writingInt( _d->nIndexes ); }
+ int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); }
+ NamespaceDetails *_d;
+ };
+
+ // throws DBException
+ static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
+ tlog() << "build index " << ns << ' ' << idx.keyPattern() << ( background ? " background" : "" ) << endl;
+ Timer t;
+ unsigned long long n;
+
+ assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be...
+ assert( d->indexBuildInProgress == 0 );
+ assertInWriteLock();
+ RecoverableIndexState recoverable( d );
+
+ // Build index spec here in case the collection is empty and the index details are invalid
+ idx.getSpec();
+
+ if( inDBRepair || !background ) {
+ n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
+ assert( !idx.head.isNull() );
+ }
+ else {
+ BackgroundIndexBuildJob j(ns.c_str());
+ n = j.go(ns, d, idx, idxNo);
+ }
+ tlog() << "build index done " << n << " records " << t.millis() / 1000.0 << " secs" << endl;
+ }
+
+ /* add keys to indexes for a new record */
+#if 0
+ static void oldIndexRecord__notused(NamespaceDetails *d, BSONObj obj, DiskLoc loc) {
+ int n = d->nIndexesBeingBuilt();
+ for ( int i = 0; i < n; i++ ) {
+ try {
+ bool unique = d->idx(i).unique();
+ addKeysToIndex(d, i, obj, loc, /*dupsAllowed*/!unique);
+ }
+ catch( DBException& ) {
+ /* try to roll back previously added index entries
+                note <= i (not < i) is important here as the index we just attempted
+                may be multikey and require some cleanup itself.
+ */
+ for( int j = 0; j <= i; j++ ) {
+ try {
+ _unindexRecord(d->idx(j), obj, loc, false);
+ }
+ catch(...) {
+ log(3) << "unindex fails on rollback after unique failure\n";
+ }
+ }
+ throw;
+ }
+ }
+ }
+#endif
+
+ extern BSONObj id_obj; // { _id : 1 }
+
+ void ensureHaveIdIndex(const char *ns) {
+ NamespaceDetails *d = nsdetails(ns);
+ if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) )
+ return;
+
+ *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex;
+
+ {
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ if( i.next().isIdIndex() )
+ return;
+ }
+ }
+
+ string system_indexes = cc().database()->name + ".system.indexes";
+
+ BSONObjBuilder b;
+ b.append("name", "_id_");
+ b.append("ns", ns);
+ b.append("key", id_obj);
+ BSONObj o = b.done();
+
+ /* edge case: note the insert could fail if we have hit maxindexes already */
+ theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize(), true);
+ }
+
+#pragma pack(1)
+ struct IDToInsert_ {
+ char type;
+ char _id[4];
+ OID oid;
+ IDToInsert_() {
+ type = (char) jstOID;
+ strcpy(_id, "_id");
+ assert( sizeof(IDToInsert_) == 17 );
+ }
+ } idToInsert_;
+ struct IDToInsert : public BSONElement {
+ IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {}
+ } idToInsert;
+#pragma pack()
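+
+    /* IDToInsert_ is a hand-built BSON element: 1 type byte (jstOID), 4 bytes
+       for the field name "_id\0", and a 12 byte OID -- the 17 bytes asserted in
+       the constructor. wrapping it in a BSONElement lets insert() below splice a
+       fresh _id into a record without building a whole new BSONObj. */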
+
+ void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) {
+ BSONObj tmp = o;
+ insertWithObjMod( ns, tmp, god );
+ logOp( "i", ns, tmp );
+ }
+
+ /** @param o the object to insert. can be modified to add _id and thus be an in/out param
+ */
+ DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
+ bool addedID = false;
+ DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god, true, &addedID );
+ if( addedID && !loc.isNull() )
+ o = BSONObj( loc.rec() );
+ return loc;
+ }
+
+ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject );
+
+    // We are now doing two btree scans for all unique indexes (one here, and one when we've
+    // written the record to the collection). This could be made more efficient by inserting
+    // dummy data here, keeping pointers to the btree nodes holding the dummy data and then
+    // updating the dummy data with the DiskLoc of the real record.
+ void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) {
+ for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) {
+ if( d->idx(idxNo).unique() ) {
+ IndexDetails& idx = d->idx(idxNo);
+ BSONObjSet keys;
+ idx.getKeysFromObject(obj, keys);
+ BSONObj order = idx.keyPattern();
+ IndexInterface& ii = idx.idxInterface();
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ // WARNING: findSingle may not be compound index safe. this may need to change. see notes in
+ // findSingle code.
+ uassert( 12582, "duplicate key insert for unique index of capped collection",
+ ii.findSingle(idx, idx.head, *i ).isNull() );
+ }
+ }
+ }
+ }
+
+ /** add a record to the end of the linked list chain within this extent.
+ require: you must have already declared write intent for the record header.
+ */
+ void addRecordToRecListInExtent(Record *r, DiskLoc loc) {
+ dassert( loc.rec() == r );
+ Extent *e = r->myExtent(loc);
+ if ( e->lastRecord.isNull() ) {
+ Extent::FL *fl = getDur().writing(e->fl());
+ fl->firstRecord = fl->lastRecord = loc;
+ r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+ }
+ else {
+ Record *oldlast = e->lastRecord.rec();
+ r->prevOfs = e->lastRecord.getOfs();
+ r->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
+ getDur().writingDiskLoc(e->lastRecord) = loc;
+ }
+ }
+
+ NOINLINE_DECL DiskLoc outOfSpace(const char *ns, NamespaceDetails *d, int lenWHdr, bool god, DiskLoc extentLoc) {
+ DiskLoc loc;
+ if ( d->capped == 0 ) { // size capped doesn't grow
+ log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
+ cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
+ loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( loc.isNull() ) {
+ log() << "warning: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
+ for ( int z=0; z<10 && lenWHdr > d->lastExtentSize; z++ ) {
+ log() << "try #" << z << endl;
+ cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
+ loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( ! loc.isNull() )
+ break;
+ }
+ }
+ }
+ return loc;
+ }
+
+ /** used by insert and also compact
+ * @return null loc if out of space
+ */
+ DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god) {
+ DiskLoc extentLoc;
+ DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( loc.isNull() ) {
+ loc = outOfSpace(ns, d, lenWHdr, god, extentLoc);
+ }
+ return loc;
+ }
+
+ bool NOINLINE_DECL insert_checkSys(const char *sys, const char *ns, bool& wouldAddIndex, const void *obuf, bool god) {
+ uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
+ if ( strstr(ns, ".system.") ) {
+ // later:check for dba-type permissions here if have that at some point separate
+ if ( strstr(ns, ".system.indexes" ) )
+ wouldAddIndex = true;
+ else if ( legalClientSystemNS( ns , true ) ) {
+ if ( obuf && strstr( ns , ".system.users" ) ) {
+ BSONObj t( reinterpret_cast<const char *>( obuf ) );
+ uassert( 14051 , "system.user entry needs 'user' field to be a string" , t["user"].type() == String );
+ uassert( 14052 , "system.user entry needs 'pwd' field to be a string" , t["pwd"].type() == String );
+ uassert( 14053 , "system.user entry needs 'user' field to be non-empty" , t["user"].String().size() );
+ uassert( 14054 , "system.user entry needs 'pwd' field to be non-empty" , t["pwd"].String().size() );
+ }
+ }
+ else if ( !god ) {
+                // todo this should probably uassert rather than doing this:
+ log() << "ERROR: attempt to insert in system namespace " << ns << endl;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ NOINLINE_DECL NamespaceDetails* insert_newNamespace(const char *ns, int len, bool god) {
+ addNewNamespaceToCatalog(ns);
+ /* todo: shouldn't be in the namespace catalog until after the allocations here work.
+ also if this is an addIndex, those checks should happen before this!
+ */
+ // This may create first file in the database.
+ int ies = Extent::initialSize(len);
+ if( str::contains(ns, '$') && len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 && len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
+ // probably an index. so we pick a value here for the first extent instead of using initialExtentSize() which is more
+ // for user collections. TODO: we could look at the # of records in the parent collection to be smarter here.
+ ies = (32+4) * 1024;
+ }
+ cc().database()->allocExtent(ns, ies, false, false);
+ NamespaceDetails *d = nsdetails(ns);
+ if ( !god )
+ ensureIdIndexForNewNs(ns);
+ return d;
+ }
+
+ void NOINLINE_DECL insert_makeIndex(NamespaceDetails *tableToIndex, const string& tabletoidxns, const DiskLoc& loc) {
+ uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos );
+
+ BSONObj info = loc.obj();
+ bool background = info["background"].trueValue();
+ // if this is not readable, let's move things along
+ if (background && ((!theReplSet && cc().isSyncThread()) || (theReplSet && !theReplSet->isSecondary()))) {
+ log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
+ background = false;
+ }
+
+ int idxNo = tableToIndex->nIndexes;
+ IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
+ getDur().writingDiskLoc(idx.info) = loc;
+ try {
+ buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
+ }
+ catch( DBException& e ) {
+ // save our error msg string as an exception or dropIndexes will overwrite our message
+ LastError *le = lastError.get();
+ int savecode = 0;
+ string saveerrmsg;
+ if ( le ) {
+ savecode = le->code;
+ saveerrmsg = le->msg;
+ }
+ else {
+ savecode = e.getCode();
+ saveerrmsg = e.what();
+ }
+
+ // roll back this index
+ string name = idx.indexName();
+ BSONObjBuilder b;
+ string errmsg;
+ bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true);
+ if( !ok ) {
+ log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
+ }
+
+ assert( le && !saveerrmsg.empty() );
+ raiseError(savecode,saveerrmsg.c_str());
+ throw;
+ }
+ }
+
+ /* if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
+ after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
+
+ @param mayAddIndex almost always true, except for invocation from rename namespace command.
+ @param addedID if not null, set to true if adding _id element. you must assure false before calling
+ if using.
+ */
+
+ DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, bool mayAddIndex, bool *addedID) {
+ bool wouldAddIndex = false;
+ massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
+ uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
+ {
+ const char *sys = strstr(ns, "system.");
+ if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
+ return DiskLoc();
+ }
+ bool addIndex = wouldAddIndex && mayAddIndex;
+
+ NamespaceDetails *d = nsdetails(ns);
+ if ( d == 0 ) {
+ d = insert_newNamespace(ns, len, god);
+ }
+
+ NamespaceDetails *tableToIndex = 0;
+
+ string tabletoidxns;
+ BSONObj fixedIndexObject;
+ if ( addIndex ) {
+ assert( obuf );
+ BSONObj io((const char *) obuf);
+ if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) ) {
+ // prepare creates _id itself, or this indicates to fail the build silently (such
+ // as if index already exists)
+ return DiskLoc();
+ }
+ if ( ! fixedIndexObject.isEmpty() ) {
+ obuf = fixedIndexObject.objdata();
+ len = fixedIndexObject.objsize();
+ }
+ }
+
+ int addID = 0; // 0 if not adding _id; if adding, the length of that new element
+ if( !god ) {
+ /* Check if we have an _id field. If we don't, we'll add it.
+ Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
+ */
+ BSONObj io((const char *) obuf);
+ BSONElement idField = io.getField( "_id" );
+ uassert( 10099 , "_id cannot be an array", idField.type() != Array );
+ // we don't add _id for capped collections as they don't have an _id index
+ if( idField.eoo() && !wouldAddIndex && strstr(ns, ".local.") == 0 && d->haveIdIndex() ) {
+ if( addedID )
+ *addedID = true;
+ addID = len;
+ idToInsert_.oid.init();
+ len += idToInsert.size();
+ }
+
+ BSONElementManipulator::lookForTimestamps( io );
+ }
+
+ int lenWHdr = len + Record::HeaderSize;
+ lenWHdr = (int) (lenWHdr * d->paddingFactor);
+ if ( lenWHdr == 0 ) {
+ // old datafiles, backward compatible here.
+ assert( d->paddingFactor == 0 );
+ *getDur().writing(&d->paddingFactor) = 1.0;
+ lenWHdr = len + Record::HeaderSize;
+ }
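+        /* e.g. (illustrative numbers) a 1000 byte object with paddingFactor 1.5
+           reserves (1000 + Record::HeaderSize) * 1.5 bytes so the record can grow
+           in place; paddingFits()/paddingTooSmall() in updateRecord above nudge
+           the factor over time. */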
+
+ // If the collection is capped, check if the new object will violate a unique index
+ // constraint before allocating space.
+ if ( d->nIndexes && d->capped && !god ) {
+ checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) );
+ }
+
+ bool earlyIndex = true;
+ DiskLoc loc;
+ if( addID || tableToIndex || d->capped ) {
+            // if we need an _id, we don't do the early indexing. this is not the common case, so that is sort of ok
+ earlyIndex = false;
+ loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
+ }
+ else {
+ loc = d->allocWillBeAt(ns, lenWHdr);
+ if( loc.isNull() ) {
+ // need to get a new extent so we have to do the true alloc now (not common case)
+ earlyIndex = false;
+ loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
+ }
+ }
+ if ( loc.isNull() ) {
+ log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->capped << endl;
+ assert(d->capped);
+ return DiskLoc();
+ }
+
+ if( earlyIndex ) {
+ // add record to indexes using two step method so we can do the reading outside a write lock
+ if ( d->nIndexes ) {
+ assert( obuf );
+ BSONObj obj((const char *) obuf);
+ try {
+ indexRecordUsingTwoSteps(d, obj, loc, true);
+ }
+ catch( AssertionException& ) {
+ // should be a dup key error on _id index
+ dassert( !tableToIndex && !d->capped );
+ // no need to delete/rollback the record as it was not added yet
+ throw;
+ }
+ }
+ // really allocate now
+ DiskLoc real = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
+ assert( real == loc );
+ }
+
+ Record *r = loc.rec();
+ {
+ assert( r->lengthWithHeaders >= lenWHdr );
+ r = (Record*) getDur().writingPtr(r, lenWHdr);
+ if( addID ) {
+ /* a little effort was made here to avoid a double copy when we add an ID */
+ ((int&)*r->data) = *((int*) obuf) + idToInsert.size();
+ memcpy(r->data+4, idToInsert.rawdata(), idToInsert.size());
+ memcpy(r->data+4+idToInsert.size(), ((char *)obuf)+4, addID-4);
+ }
+ else {
+ if( obuf ) // obuf can be null from internal callers
+ memcpy(r->data, obuf, len);
+ }
+ }
+
+ addRecordToRecListInExtent(r, loc);
+
+ /* durability todo : this could be a bit annoying / slow to record constantly */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize += r->netLength();
+ s->nrecords++;
+ }
+
+ // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
+ if ( !god )
+ NamespaceDetailsTransient::get( ns ).notifyOfWriteOp();
+
+ if ( tableToIndex ) {
+ insert_makeIndex(tableToIndex, tabletoidxns, loc);
+ }
+
+ /* add this record to our indexes */
+ if ( !earlyIndex && d->nIndexes ) {
+ try {
+ BSONObj obj(r->data);
+ // not sure which of these is better -- either can be used. oldIndexRecord may be faster,
+ // but twosteps handles dup key errors more efficiently.
+ //oldIndexRecord(d, obj, loc);
+ indexRecordUsingTwoSteps(d, obj, loc, false);
+
+ }
+ catch( AssertionException& e ) {
+ // should be a dup key error on _id index
+ if( tableToIndex || d->capped ) {
+ massert( 12583, "unexpected index insertion failure on capped collection", !d->capped );
+ string s = e.toString();
+ s += " : on addIndex/capped - collection and its index will not match";
+ uassert_nothrow(s.c_str());
+ error() << s << endl;
+ }
+ else {
+ // normal case -- we can roll back
+ _deleteRecord(d, ns, r, loc);
+ throw;
+ }
+ }
+ }
+
+ d->paddingFits();
+
+ return loc;
+ }
+
+ /* special version of insert for transaction logging -- streamlined a bit.
+ assumes ns is capped and no indexes
+ */
+ Record* DataFileMgr::fast_oplog_insert(NamespaceDetails *d, const char *ns, int len) {
+ assert( d );
+ RARELY assert( d == nsdetails(ns) );
+ DEV assert( d == nsdetails(ns) );
+
+ DiskLoc extentLoc;
+ int lenWHdr = len + Record::HeaderSize;
+ DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+ assert( !loc.isNull() );
+
+ Record *r = loc.rec();
+ assert( r->lengthWithHeaders >= lenWHdr );
+
+ Extent *e = r->myExtent(loc);
+ if ( e->lastRecord.isNull() ) {
+ Extent::FL *fl = getDur().writing( e->fl() );
+ fl->firstRecord = fl->lastRecord = loc;
+
+ Record::NP *np = getDur().writing(r->np());
+ np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
+ }
+ else {
+ Record *oldlast = e->lastRecord.rec();
+ Record::NP *np = getDur().writing(r->np());
+ np->prevOfs = e->lastRecord.getOfs();
+ np->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt( oldlast->nextOfs ) = loc.getOfs();
+ e->lastRecord.writing() = loc;
+ }
+
+ /* todo: don't update for oplog? seems wasteful. */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize += r->netLength();
+ s->nrecords++;
+ }
+
+ return r;
+ }
+
+} // namespace mongo
+
+#include "clientcursor.h"
+
+namespace mongo {
+
+ void dropAllDatabasesExceptLocal() {
+ writelock lk("");
+
+ vector<string> n;
+ getDatabaseNames(n);
+ if( n.size() == 0 ) return;
+ log() << "dropAllDatabasesExceptLocal " << n.size() << endl;
+ for( vector<string>::iterator i = n.begin(); i != n.end(); i++ ) {
+ if( *i != "local" ) {
+ Client::Context ctx(*i);
+ dropDatabase(*i);
+ }
+ }
+ }
+
+ void dropDatabase(string db) {
+ log(1) << "dropDatabase " << db << endl;
+ Database *d = cc().database();
+ assert( d );
+ assert( d->name == db );
+
+ BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str());
+
+ mongo::d.dbMutex.assertWriteLocked();
+
+ // Not sure we need this here, so removed. If we do, we need to move it down
+ // within other calls both (1) as they could be called from elsewhere and
+ // (2) to keep the lock order right - groupcommitmutex must be locked before
+ // mmmutex (if both are locked).
+ //
+ // RWLockRecursive::Exclusive lk(MongoFile::mmmutex);
+
+ getDur().syncDataAndTruncateJournal();
+
+ Database::closeDatabase( d->name.c_str(), d->path );
+ d = 0; // d is now deleted
+
+ _deleteDataFiles( db.c_str() );
+ }
+
+ typedef boost::filesystem::path Path;
+
+ void boostRenameWrapper( const Path &from, const Path &to ) {
+ try {
+ boost::filesystem::rename( from, to );
+ }
+ catch ( const boost::filesystem::filesystem_error & ) {
+ // boost rename doesn't work across partitions
+ boost::filesystem::copy_file( from, to);
+ boost::filesystem::remove( from );
+ }
+ }
+
+ // back up original database files to 'temp' dir
+ void _renameForBackup( const char *database, const Path &reservedPath ) {
+ Path newPath( reservedPath );
+ if ( directoryperdb )
+ newPath /= database;
+ class Renamer : public FileOp {
+ public:
+ Renamer( const Path &newPath ) : newPath_( newPath ) {}
+ private:
+ const boost::filesystem::path &newPath_;
+ virtual bool apply( const Path &p ) {
+ if ( !boost::filesystem::exists( p ) )
+ return false;
+ boostRenameWrapper( p, newPath_ / ( p.leaf() + ".bak" ) );
+ return true;
+ }
+ virtual const char * op() const {
+ return "renaming";
+ }
+ } renamer( newPath );
+ _applyOpToDataFiles( database, renamer, true );
+ }
+
+ // move temp files to standard data dir
+ void _replaceWithRecovered( const char *database, const char *reservedPathString ) {
+ Path newPath( dbpath );
+ if ( directoryperdb )
+ newPath /= database;
+ class Replacer : public FileOp {
+ public:
+ Replacer( const Path &newPath ) : newPath_( newPath ) {}
+ private:
+ const boost::filesystem::path &newPath_;
+ virtual bool apply( const Path &p ) {
+ if ( !boost::filesystem::exists( p ) )
+ return false;
+ boostRenameWrapper( p, newPath_ / p.leaf() );
+ return true;
+ }
+ virtual const char * op() const {
+ return "renaming";
+ }
+ } replacer( newPath );
+ _applyOpToDataFiles( database, replacer, true, reservedPathString );
+ }
+
+ // generate a directory name for storing temp data files
+ Path uniqueReservedPath( const char *prefix ) {
+ Path repairPath = Path( repairpath );
+ Path reservedPath;
+ int i = 0;
+ bool exists = false;
+ do {
+ stringstream ss;
+ ss << prefix << "_repairDatabase_" << i++;
+ reservedPath = repairPath / ss.str();
+ BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) );
+ }
+ while ( exists );
+ return reservedPath;
+ }
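+
+    /* e.g. (hypothetical paths) with repairpath /data/db and prefix "backup"
+       this probes /data/db/backup_repairDatabase_0, then _1, ... until a
+       directory name that doesn't exist yet is found. */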
+
+ boost::intmax_t dbSize( const char *database ) {
+ class SizeAccumulator : public FileOp {
+ public:
+ SizeAccumulator() : totalSize_( 0 ) {}
+ boost::intmax_t size() const {
+ return totalSize_;
+ }
+ private:
+ virtual bool apply( const boost::filesystem::path &p ) {
+ if ( !boost::filesystem::exists( p ) )
+ return false;
+ totalSize_ += boost::filesystem::file_size( p );
+ return true;
+ }
+ virtual const char *op() const {
+ return "checking size";
+ }
+ boost::intmax_t totalSize_;
+ };
+ SizeAccumulator sa;
+ _applyOpToDataFiles( database, sa );
+ return sa.size();
+ }
+
+ bool repairDatabase( string dbNameS , string &errmsg,
+ bool preserveClonedFilesOnFailure, bool backupOriginalFiles ) {
+ doingRepair dr;
+ dbNameS = nsToDatabase( dbNameS );
+ const char * dbName = dbNameS.c_str();
+
+ stringstream ss;
+ ss << "localhost:" << cmdLine.port;
+ string localhost = ss.str();
+
+ problem() << "repairDatabase " << dbName << endl;
+ assert( cc().database()->name == dbName );
+ assert( cc().database()->path == dbpath );
+
+ BackgroundOperation::assertNoBgOpInProgForDb(dbName);
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
+ boost::intmax_t totalSize = dbSize( dbName );
+ boost::intmax_t freeSize = File::freeSpace(repairpath);
+ if ( freeSize > -1 && freeSize < totalSize ) {
+ stringstream ss;
+ ss << "Cannot repair database " << dbName << " having size: " << totalSize
+ << " (bytes) because free disk space is: " << freeSize << " (bytes)";
+ errmsg = ss.str();
+ problem() << errmsg << endl;
+ return false;
+ }
+
+ Path reservedPath =
+ uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ?
+ "backup" : "_tmp" );
+ BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
+ string reservedPathString = reservedPath.native_directory_string();
+
+ bool res;
+ {
+ // clone to temp location, which effectively does repair
+ Client::Context ctx( dbName, reservedPathString );
+ assert( ctx.justCreated() );
+
+ res = cloneFrom(localhost.c_str(), errmsg, dbName,
+ /*logForReplication=*/false, /*slaveOk*/false, /*replauth*/false,
+ /*snapshot*/false, /*mayYield*/false, /*mayBeInterrupted*/true);
+ Database::closeDatabase( dbName, reservedPathString.c_str() );
+ }
+
+ if ( !res ) {
+ errmsg = str::stream() << "clone failed for " << dbName << " with error: " << errmsg;
+ problem() << errmsg << endl;
+
+ if ( !preserveClonedFilesOnFailure )
+ BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
+ return false;
+ }
+
+ MongoFile::flushAll(true);
+
+ Client::Context ctx( dbName );
+ Database::closeDatabase( dbName, dbpath );
+
+ if ( backupOriginalFiles ) {
+ _renameForBackup( dbName, reservedPath );
+ }
+ else {
+ _deleteDataFiles( dbName );
+ BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) );
+ }
+
+ _replaceWithRecovered( dbName, reservedPathString.c_str() );
+
+ if ( !backupOriginalFiles )
+ BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
+ return true;
+ }
+
+ void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) {
+ if ( afterAllocator )
+ FileAllocator::get()->waitUntilFinished();
+ string c = database;
+ c += '.';
+ boost::filesystem::path p(path);
+ if ( directoryperdb )
+ p /= database;
+ boost::filesystem::path q;
+ q = p / (c+"ns");
+ bool ok = false;
+ BOOST_CHECK_EXCEPTION( ok = fo.apply( q ) );
+ if ( ok )
+ log(2) << fo.op() << " file " << q.string() << endl;
+ int i = 0;
+ int extra = 10; // should not be necessary, this is defensive in case there are missing files
+ while ( 1 ) {
+ assert( i <= DiskLoc::MaxFiles );
+ stringstream ss;
+ ss << c << i;
+ q = p / ss.str();
+ BOOST_CHECK_EXCEPTION( ok = fo.apply(q) );
+ if ( ok ) {
+ if ( extra != 10 ) {
+ log(1) << fo.op() << " file " << q.string() << endl;
+ log() << " _applyOpToDataFiles() warning: extra == " << extra << endl;
+ }
+ }
+ else if ( --extra <= 0 )
+ break;
+ i++;
+ }
+ }
+
+ NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); }
+
+ bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) {
+ log() << "DatabaseHolder::closeAll path:" << path << endl;
+ d.dbMutex.assertWriteLocked();
+
+ map<string,Database*>& m = _paths[path];
+ _size -= m.size();
+
+ set< string > dbs;
+ for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
+ wassert( i->second->path == path );
+ dbs.insert( i->first );
+ }
+
+ currentClient.get()->getContext()->_clear();
+
+ BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
+ int n = 0;
+ int nNotClosed = 0;
+ for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) {
+ string name = *i;
+ log(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl;
+ Client::Context ctx( name , path );
+ if( !force && BackgroundOperation::inProgForDb(name.c_str()) ) {
+ log() << "WARNING: can't close database " << name << " because a bg job is in progress - try killOp command" << endl;
+ nNotClosed++;
+ }
+ else {
+ Database::closeDatabase( name.c_str() , path );
+ bb.append( bb.numStr( n++ ) , name );
+ }
+ }
+ bb.done();
+ if( nNotClosed )
+ result.append("nNotClosed", nNotClosed);
+ else {
+ ClientCursor::assertNoCursors();
+ }
+
+ return true;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/pdfile.h b/src/mongo/db/pdfile.h
new file mode 100644
index 00000000000..cd6062b1a48
--- /dev/null
+++ b/src/mongo/db/pdfile.h
@@ -0,0 +1,546 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* pdfile.h
+
+   Files:
+     database.ns - namespace index
+     database.0 - data files
+     database.1
+     ...
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/mmap.h"
+#include "diskloc.h"
+#include "jsobjmanipulator.h"
+#include "namespace-inl.h"
+#include "client.h"
+#include "mongommf.h"
+
+namespace mongo {
+
+ class DataFileHeader;
+ class Extent;
+ class Record;
+ class Cursor;
+ class OpDebug;
+
+ void dropDatabase(string db);
+ bool repairDatabase(string db, string &errmsg, bool preserveClonedFilesOnFailure = false, bool backupOriginalFiles = false);
+
+ /* low level - only drops this ns */
+ void dropNS(const string& dropNs);
+
+ /* deletes this ns, indexes and cursors */
+ void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result );
+ bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication, bool *deferIdIndex = 0);
+ shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc=DiskLoc());
+
+ bool isValidNS( const StringData& ns );
+
+ /*---------------------------------------------------------------------*/
+
+ class MongoDataFile {
+ friend class DataFileMgr;
+ friend class BasicCursor;
+ public:
+ MongoDataFile(int fn) : _mb(0), fileNo(fn) { }
+
+        /** @return true if found and opened. If the file is uninitialized (preallocated only), it is not opened. */
+ bool openExisting( const char *filename );
+
+        /** creates the file if it does not already exist */
+ void open(const char *filename, int requestedDataSize = 0, bool preallocateOnly = false);
+
+ /* allocate a new extent from this datafile.
+ @param capped - true if capped collection
+ @param loops is our recursion check variable - you want to pass in zero
+ */
+ Extent* createExtent(const char *ns, int approxSize, bool capped = false, int loops = 0);
+
+ DataFileHeader *getHeader() { return header(); }
+
+ unsigned long long length() const { return mmf.length(); }
+
+ /* return max size an extent may be */
+ static int maxSize();
+
+ /** fsync */
+ void flush( bool sync );
+
+        /** only use for debugging */
+ Extent* debug_getExtent(DiskLoc loc) { return _getExtent( loc ); }
+ private:
+ void badOfs(int) const;
+ void badOfs2(int) const;
+ int defaultSize( const char *filename ) const;
+
+ Extent* getExtent(DiskLoc loc) const;
+ Extent* _getExtent(DiskLoc loc) const;
+ Record* recordAt(DiskLoc dl);
+ Record* makeRecord(DiskLoc dl, int size);
+ void grow(DiskLoc dl, int size);
+
+ char* p() const { return (char *) _mb; }
+ DataFileHeader* header() { return (DataFileHeader*) _mb; }
+
+ MongoMMF mmf;
+ void *_mb; // the memory mapped view
+ int fileNo;
+ };
+
+ class DataFileMgr {
+ friend class BasicCursor;
+ public:
+ void init(const string& path );
+
+ /* see if we can find an extent of the right size in the freelist. */
+ static Extent* allocFromFreeList(const char *ns, int approxSize, bool capped = false);
+
+ /** @return DiskLoc where item ends up */
+ // changedId should be initialized to false
+ const DiskLoc updateRecord(
+ const char *ns,
+ NamespaceDetails *d,
+ NamespaceDetailsTransient *nsdt,
+ Record *toupdate, const DiskLoc& dl,
+ const char *buf, int len, OpDebug& debug, bool god=false);
+
+ // The object o may be updated if modified on insert.
+ void insertAndLog( const char *ns, const BSONObj &o, bool god = false );
+
+ /** insert will add an _id to the object if not present. if you would like to see the final object
+ after such an addition, use this method.
+            @param o both an in and out param
+ */
+ DiskLoc insertWithObjMod(const char *ns, BSONObj & /*out*/o, bool god = false);
+
+ /** @param obj in value only for this version. */
+ void insertNoReturnVal(const char *ns, BSONObj o, bool god = false);
+
+ DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, bool mayAddIndex = true, bool *addedID = 0);
+ static shared_ptr<Cursor> findAll(const char *ns, const DiskLoc &startLoc = DiskLoc());
+
+ /* special version of insert for transaction logging -- streamlined a bit.
+           assumes ns is capped and has no indexes
+ no _id field check
+ */
+ Record* fast_oplog_insert(NamespaceDetails *d, const char *ns, int len);
+
+ static Extent* getExtent(const DiskLoc& dl);
+ static Record* getRecord(const DiskLoc& dl);
+ static DeletedRecord* makeDeletedRecord(const DiskLoc& dl, int len);
+
+ void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false, bool logOp=false);
+
+ /* does not clean up indexes, etc. : just deletes the record in the pdfile. use deleteRecord() to unindex */
+ void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl);
+
+ private:
+ vector<MongoDataFile *> files;
+ };
+
+ extern DataFileMgr theDataFileMgr;
+
+#pragma pack(1)
+
+ class DeletedRecord {
+ public:
+ int lengthWithHeaders;
+ int extentOfs;
+ DiskLoc nextDeleted;
+ DiskLoc myExtentLoc(const DiskLoc& myLoc) const {
+ return DiskLoc(myLoc.a(), extentOfs);
+ }
+ Extent* myExtent(const DiskLoc& myLoc) {
+ return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs));
+ }
+ };
+
+ /* Record is a record in a datafile. DeletedRecord is similar but for deleted space.
+
+    (11:03:20 AM) dm10gen: regarding extentOfs...
+    (11:03:42 AM) dm10gen: an extent is a contiguous disk area, which contains many Records and DeletedRecords
+    (11:03:56 AM) dm10gen: a DiskLoc has two pieces, the fileno and ofs. (64 bit total)
+    (11:04:16 AM) dm10gen: to keep the headers small, instead of storing a 64 bit ptr to the full extent address, we keep just the offset
+ (11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo
+ (11:04:33 AM) dm10gen: see class DiskLoc for more info
+ (11:04:43 AM) dm10gen: so that is how Record::myExtent() works
+ (11:04:53 AM) dm10gen: on an alloc(), when we build a new Record, we must populate its extentOfs then
+ */
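+    /* illustrative sketch (not part of the original interface): for a Record
+       living at DiskLoc(fileNo, recOfs), its extent can be recovered from the
+       4-byte extentOfs field alone, because the fileNo is shared:
+
+           DiskLoc extentLoc( myLoc.a(), extentOfs );   // same file, stored ofs
+    */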
+ class Record {
+ public:
+ enum HeaderSizeValue { HeaderSize = 16 };
+ int lengthWithHeaders;
+ int extentOfs;
+ int nextOfs;
+ int prevOfs;
+
+        /** be careful when referencing this; make sure your write intent was declared first */
+ char data[4];
+
+ int netLength() {
+ return lengthWithHeaders - HeaderSize;
+ }
+ //void setNewLength(int netlen) { lengthWithHeaders = netlen + HeaderSize; }
+
+ /* use this when a record is deleted. basically a union with next/prev fields */
+ DeletedRecord& asDeleted() { return *((DeletedRecord*) this); }
+
+ Extent* myExtent(const DiskLoc& myLoc) { return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs)); }
+
+ /* get the next record in the namespace, traversing extents as necessary */
+ DiskLoc getNext(const DiskLoc& myLoc);
+ DiskLoc getPrev(const DiskLoc& myLoc);
+
+ DiskLoc nextInExtent(const DiskLoc& myLoc) {
+ if ( nextOfs == DiskLoc::NullOfs )
+ return DiskLoc();
+ assert( nextOfs );
+ return DiskLoc(myLoc.a(), nextOfs);
+ }
+
+ struct NP {
+ int nextOfs;
+ int prevOfs;
+ };
+ NP* np() { return (NP*) &nextOfs; }
+
+ // ---------------------
+ // memory cache
+ // ---------------------
+
+        /**
+         * touches the data so that it is in physical memory
+         * @param entireRecord if false, only the header and first byte are touched
+         *                     if true, the entire record is touched
+         */
+        void touch( bool entireRecord = false );
+
+ /**
+         * @return whether this record is likely in physical memory
+         *         not guaranteed, because it is possible for it to get swapped out in a very unlucky window
+ */
+ bool likelyInPhysicalMemory();
+
+ /**
+ * tell the cache this Record was accessed
+ * @return this, for simple chaining
+ */
+ Record* accessed();
+
+ static bool MemoryTrackingEnabled;
+ };
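+
+    /* usage sketch (illustrative): callers that are about to read a record
+       typically chain the cache bookkeeping with the access itself, as
+       DiskLoc::obj() does below:
+
+           Record *r = DataFileMgr::getRecord(dl)->accessed();
+           r->touch();   // fault the header into physical memory first
+    */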
+
+ /* extents are datafile regions where all the records within the region
+ belong to the same namespace.
+
+ (11:12:35 AM) dm10gen: when the extent is allocated, all its empty space is stuck into one big DeletedRecord
+ (11:12:55 AM) dm10gen: and that is placed on the free list
+ */
+ class Extent {
+ public:
+ unsigned magic;
+ DiskLoc myLoc;
+ DiskLoc xnext, xprev; /* next/prev extent for this namespace */
+
+ /* which namespace this extent is for. this is just for troubleshooting really
+ and won't even be correct if the collection were renamed!
+ */
+ Namespace nsDiagnostic;
+
+ int length; /* size of the extent, including these fields */
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
+ char _extentData[4];
+
+ static int HeaderSize() { return sizeof(Extent)-4; }
+
+ bool validates() {
+ return !(firstRecord.isNull() ^ lastRecord.isNull()) &&
+ length >= 0 && !myLoc.isNull();
+ }
+
+ BSONObj dump() {
+ return BSON( "loc" << myLoc.toString() << "xnext" << xnext.toString() << "xprev" << xprev.toString()
+ << "nsdiag" << nsDiagnostic.toString()
+ << "size" << length << "firstRecord" << firstRecord.toString() << "lastRecord" << lastRecord.toString());
+ }
+
+ void dump(iostream& s) {
+ s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() << " xprev:" << xprev.toString() << '\n';
+ s << " nsdiag:" << nsDiagnostic.toString() << '\n';
+ s << " size:" << length << " firstRecord:" << firstRecord.toString() << " lastRecord:" << lastRecord.toString() << '\n';
+ }
+
+ /* assumes already zeroed -- insufficient for block 'reuse' perhaps
+ Returns a DeletedRecord location which is the data in the extent ready for us.
+           Caller will need to add that to the freelist structure in NamespaceDetails.
+ */
+ DiskLoc init(const char *nsname, int _length, int _fileNo, int _offset, bool capped);
+
+ /* like init(), but for a reuse case */
+ DiskLoc reuse(const char *nsname, bool newUseIsAsCapped);
+
+ bool isOk() const { return magic == 0x41424344; }
+ void assertOk() const { assert(isOk()); }
+
+ Record* newRecord(int len);
+
+ Record* getRecord(DiskLoc dl) {
+ assert( !dl.isNull() );
+ assert( dl.sameFile(myLoc) );
+ int x = dl.getOfs() - myLoc.getOfs();
+ assert( x > 0 );
+ return (Record *) (((char *) this) + x);
+ }
+
+ Extent* getNextExtent() { return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext); }
+ Extent* getPrevExtent() { return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev); }
+
+ static int maxSize();
+ static int minSize() { return 0x100; }
+ /**
+         * @param len length of the record we need
+         * @param lastExtentLen size of the last extent, which is a factor in the next extent size
+ */
+ static int followupSize(int len, int lastExtentLen);
+
+ /** get a suggested size for the first extent in a namespace
+ * @param len length of record we need to insert
+ */
+ static int initialSize(int len);
+
+ struct FL {
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
+ };
+ /** often we want to update just the firstRecord and lastRecord fields.
+ this helper is for that -- for use with getDur().writing() method
+ */
+ FL* fl() { return (FL*) &firstRecord; }
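+        /* usage sketch (illustrative only -- e and newLoc are hypothetical):
+
+               Extent::FL *fl = getDur().writing( e->fl() );
+               fl->firstRecord = fl->lastRecord = newLoc;
+        */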
+
+ /** caller must declare write intent first */
+ void markEmpty();
+ private:
+ DiskLoc _reuse(const char *nsname, bool newUseIsAsCapped); // recycle an extent and reuse it for a different ns
+ };
+
+ /* a datafile - i.e. the "dbname.<#>" files :
+
+ ----------------------
+ DataFileHeader
+ ----------------------
+ Extent (for a particular namespace)
+ Record
+ ...
+ Record (some chained for unused space)
+ ----------------------
+ more Extents...
+ ----------------------
+ */
+ class DataFileHeader {
+ public:
+ int version;
+ int versionMinor;
+ int fileLength;
+ DiskLoc unused; /* unused is the portion of the file that doesn't belong to any allocated extents. -1 = no more */
+ int unusedLength;
+ char reserved[8192 - 4*4 - 8];
+
+ char data[4]; // first extent starts here
+
+ enum { HeaderSize = 8192 };
+
+ bool isCurrentVersion() const { return ( version == PDFILE_VERSION ) && ( versionMinor == PDFILE_VERSION_MINOR ); }
+
+ bool uninitialized() const { return version == 0; }
+
+ void init(int fileno, int filelength, const char* filename) {
+ if ( uninitialized() ) {
+ DEV log() << "datafileheader::init initializing " << filename << " n:" << fileno << endl;
+ if( !(filelength > 32768 ) ) {
+ massert(13640, str::stream() << "DataFileHeader looks corrupt at file open filelength:" << filelength << " fileno:" << fileno, false);
+ }
+
+ {
+ if( !d.dbMutex.isWriteLocked() ) {
+ log() << "*** TEMP NOT INITIALIZING FILE " << filename << ", not in a write lock." << endl;
+ log() << "temp bypass until more elaborate change - case that is manifesting is benign anyway" << endl;
+ return;
+/**
+ log() << "ERROR can't create outside a write lock" << endl;
+ printStackTrace();
+ ::abort();
+**/
+ }
+ }
+
+ getDur().createdFile(filename, filelength);
+ assert( HeaderSize == 8192 );
+ DataFileHeader *h = getDur().writing(this);
+ h->fileLength = filelength;
+ h->version = PDFILE_VERSION;
+ h->versionMinor = PDFILE_VERSION_MINOR;
+ h->unused.set( fileno, HeaderSize );
+ assert( (data-(char*)this) == HeaderSize );
+ h->unusedLength = fileLength - HeaderSize - 16;
+ }
+ }
+
+ bool isEmpty() const {
+ return uninitialized() || ( unusedLength == fileLength - HeaderSize - 16 );
+ }
+ };
+
+#pragma pack()
+
+ inline Extent* MongoDataFile::_getExtent(DiskLoc loc) const {
+ loc.assertOk();
+ Extent *e = (Extent *) (p()+loc.getOfs());
+ return e;
+ }
+
+ inline Extent* MongoDataFile::getExtent(DiskLoc loc) const {
+ Extent *e = _getExtent(loc);
+ e->assertOk();
+ return e;
+ }
+
+} // namespace mongo
+
+#include "cursor.h"
+
+namespace mongo {
+
+ inline Record* MongoDataFile::recordAt(DiskLoc dl) {
+ int ofs = dl.getOfs();
+ if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path
+ return (Record*) (p()+ofs);
+ }
+
+ inline Record* MongoDataFile::makeRecord(DiskLoc dl, int size) {
+ int ofs = dl.getOfs();
+ if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path
+ return (Record*) (p()+ofs);
+ }
+
+ inline DiskLoc Record::getNext(const DiskLoc& myLoc) {
+ if ( nextOfs != DiskLoc::NullOfs ) {
+ /* defensive */
+ if ( nextOfs >= 0 && nextOfs < 10 ) {
+ sayDbContext("Assertion failure - Record::getNext() referencing a deleted record?");
+ return DiskLoc();
+ }
+
+ return DiskLoc(myLoc.a(), nextOfs);
+ }
+ Extent *e = myExtent(myLoc);
+ while ( 1 ) {
+ if ( e->xnext.isNull() )
+ return DiskLoc(); // end of table.
+ e = e->xnext.ext();
+ if ( !e->firstRecord.isNull() )
+ break;
+ // entire extent could be empty, keep looking
+ }
+ return e->firstRecord;
+ }
+ inline DiskLoc Record::getPrev(const DiskLoc& myLoc) {
+ if ( prevOfs != DiskLoc::NullOfs )
+ return DiskLoc(myLoc.a(), prevOfs);
+ Extent *e = myExtent(myLoc);
+ if ( e->xprev.isNull() )
+ return DiskLoc();
+ return e->xprev.ext()->lastRecord;
+ }
+
+ inline BSONObj DiskLoc::obj() const {
+ return BSONObj(rec()->accessed());
+ }
+ inline DeletedRecord* DiskLoc::drec() const {
+ assert( _a != -1 );
+ return (DeletedRecord*) rec();
+ }
+ inline Extent* DiskLoc::ext() const {
+ return DataFileMgr::getExtent(*this);
+ }
+
+ template< class V >
+ inline
+ const BtreeBucket<V> * DiskLoc::btree() const {
+ assert( _a != -1 );
+ return (const BtreeBucket<V> *) rec()->data;
+ }
+
+} // namespace mongo
+
+#include "database.h"
+
+namespace mongo {
+
+ boost::intmax_t dbSize( const char *database );
+
+ inline NamespaceIndex* nsindex(const char *ns) {
+ Database *database = cc().database();
+ assert( database );
+ DEV {
+ char buf[256];
+ nsToDatabase(ns, buf);
+ if ( database->name != buf ) {
+ out() << "ERROR: attempt to write to wrong database\n";
+ out() << " ns:" << ns << '\n';
+ out() << " database->name:" << database->name << endl;
+ assert( database->name == buf );
+ }
+ }
+ return &database->namespaceIndex;
+ }
+
+ inline NamespaceDetails* nsdetails(const char *ns) {
+ // if this faults, did you set the current db first? (Client::Context + dblock)
+ return nsindex(ns)->details(ns);
+ }
+
+ inline Extent* DataFileMgr::getExtent(const DiskLoc& dl) {
+ assert( dl.a() != -1 );
+ return cc().database()->getFile(dl.a())->getExtent(dl);
+ }
+
+ inline Record* DataFileMgr::getRecord(const DiskLoc& dl) {
+ assert( dl.a() != -1 );
+ return cc().database()->getFile(dl.a())->recordAt(dl);
+ }
+
+ BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) );
+
+ inline DeletedRecord* DataFileMgr::makeDeletedRecord(const DiskLoc& dl, int len) {
+ assert( dl.a() != -1 );
+ return (DeletedRecord*) cc().database()->getFile(dl.a())->makeRecord(dl, sizeof(DeletedRecord));
+ }
+
+ void ensureHaveIdIndex(const char *ns);
+
+ bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex );
+
+ inline BSONObj::BSONObj(const Record *r) {
+ init(r->data);
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/pipeline/accumulator.cpp b/src/mongo/db/pipeline/accumulator.cpp
new file mode 100755
index 00000000000..9ef8aa39470
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator.cpp
@@ -0,0 +1,92 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/accumulator.h"
+
+#include "db/jsobj.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ void Accumulator::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ uassert(15943, str::stream() << "group accumulator " <<
+ getOpName() << " only accepts one operand",
+ vpOperand.size() < 1);
+
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ Accumulator::Accumulator():
+ ExpressionNary() {
+ }
+
+ void Accumulator::opToBson(
+ BSONObjBuilder *pBuilder, string opName,
+ string fieldName, unsigned depth) const {
+ assert(vpOperand.size() == 1);
+ BSONObjBuilder builder;
+ vpOperand[0]->addToBsonObj(&builder, opName, depth);
+ pBuilder->append(fieldName, builder.done());
+ }
+
+ void Accumulator::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ opToBson(pBuilder, getOpName(), fieldName, depth);
+ }
+
+ void Accumulator::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ assert(false); // these can't appear in arrays
+ }
+
+ void agg_framework_reservedErrors() {
+ uassert(16017, "reserved error", false);
+ uassert(16018, "reserved error", false);
+ uassert(16019, "reserved error", false);
+ uassert(16020, "reserved error", false);
+ uassert(16021, "reserved error", false);
+ uassert(16022, "reserved error", false);
+ uassert(16023, "reserved error", false);
+ uassert(16024, "reserved error", false);
+ uassert(16025, "reserved error", false);
+ uassert(16026, "reserved error", false);
+ uassert(16027, "reserved error", false);
+ uassert(16028, "reserved error", false);
+ uassert(16029, "reserved error", false);
+ uassert(16030, "reserved error", false);
+ uassert(16031, "reserved error", false);
+ uassert(16032, "reserved error", false);
+ uassert(16033, "reserved error", false);
+
+ uassert(16036, "reserved error", false);
+ uassert(16037, "reserved error", false);
+ uassert(16038, "reserved error", false);
+ uassert(16039, "reserved error", false);
+ uassert(16040, "reserved error", false);
+ uassert(16041, "reserved error", false);
+ uassert(16042, "reserved error", false);
+ uassert(16043, "reserved error", false);
+ uassert(16044, "reserved error", false);
+ uassert(16045, "reserved error", false);
+ uassert(16046, "reserved error", false);
+ uassert(16047, "reserved error", false);
+ uassert(16048, "reserved error", false);
+ uassert(16049, "reserved error", false);
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator.h b/src/mongo/db/pipeline/accumulator.h
new file mode 100755
index 00000000000..a75b2c9abaa
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator.h
@@ -0,0 +1,259 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include <boost/unordered_set.hpp>
+#include "db/pipeline/value.h"
+#include "db/pipeline/expression.h"
+#include "bson/bsontypes.h"
+
+namespace mongo {
+ class ExpressionContext;
+
+ class Accumulator :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Get the accumulated value.
+
+ @returns the accumulated value
+ */
+ virtual intrusive_ptr<const Value> getValue() const = 0;
+
+ protected:
+ Accumulator();
+
+ /*
+ Convenience method for doing this for accumulators. The pattern
+ is always the same, so a common implementation works, but requires
+ knowing the operator name.
+
+          @param pBuilder the builder to add to
+          @param opName the operator name
+          @param fieldName the projected name
+        */
+        void opToBson(
+            BSONObjBuilder *pBuilder, string opName, string fieldName,
+ unsigned depth) const;
+ };
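+
+    /* typical accumulator lifecycle (illustrative sketch only; the real
+       driver is the $group document source, which is not shown here):
+
+           intrusive_ptr<Accumulator> pSum(AccumulatorSum::create(pCtx));
+           pSum->addOperand(pExpression);         // at most one operand
+           // for each input document pDoc:
+           //     pSum->evaluate(pDoc);           // folds pDoc into the total
+           intrusive_ptr<const Value> pTotal(pSum->getValue());
+    */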
+
+
+ class AccumulatorAddToSet :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an appending accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorAddToSet(const intrusive_ptr<ExpressionContext> &pTheCtx);
+ typedef boost::unordered_set<intrusive_ptr<const Value>, Value::Hash > SetType;
+ mutable SetType set;
+ mutable SetType::iterator itr;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ /*
+ This isn't a finished accumulator, but rather a convenient base class
+ for others such as $first, $last, $max, $min, and similar. It just
+ provides a holder for a single Value, and the getter for that. The
+ holder is protected so derived classes can manipulate it.
+ */
+ class AccumulatorSingleValue :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> getValue() const;
+
+ protected:
+ AccumulatorSingleValue();
+
+ mutable intrusive_ptr<const Value> pValue; /* current min/max */
+ };
+
+
+ class AccumulatorFirst :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create the accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorFirst();
+ };
+
+
+ class AccumulatorLast :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create the accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorLast();
+ };
+
+
+ class AccumulatorSum :
+ public Accumulator {
+ public:
+ // virtuals from Accumulator
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create a summing accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ protected: /* reused by AccumulatorAvg */
+ AccumulatorSum();
+
+ mutable BSONType totalType;
+ mutable long long longTotal;
+ mutable double doubleTotal;
+ };
+
+
+ class AccumulatorMinMax :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create either the max or min accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> createMin(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+ static intrusive_ptr<Accumulator> createMax(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorMinMax(int theSense);
+
+ int sense; /* 1 for min, -1 for max; used to "scale" comparison */
+ };
+
+
+ class AccumulatorPush :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an appending accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorPush(const intrusive_ptr<ExpressionContext> &pTheCtx);
+
+ mutable vector<intrusive_ptr<const Value> > vpValue;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class AccumulatorAvg :
+ public AccumulatorSum {
+ typedef AccumulatorSum Super;
+ public:
+ // virtuals from Accumulator
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an averaging accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ static const char subTotalName[];
+ static const char countName[];
+
+ AccumulatorAvg(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ mutable long long count;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+}
diff --git a/src/mongo/db/pipeline/accumulator_add_to_set.cpp b/src/mongo/db/pipeline/accumulator_add_to_set.cpp
new file mode 100755
index 00000000000..94df0293de4
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_add_to_set.cpp
@@ -0,0 +1,79 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ intrusive_ptr<const Value> AccumulatorAddToSet::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ if (prhs->getType() == Undefined)
+ ; /* nothing to add to the array */
+ else if (!pCtx->getInRouter())
+ set.insert(prhs);
+ else {
+ /*
+ If we're in the router, we need to take apart the arrays we
+ receive and put their elements into the array we are collecting.
+ If we didn't, then we'd get an array of arrays, with one array
+ from each shard that responds.
+ */
+ assert(prhs->getType() == Array);
+
+ intrusive_ptr<ValueIterator> pvi(prhs->getArray());
+ while(pvi->more()) {
+ intrusive_ptr<const Value> pElement(pvi->next());
+ set.insert(pElement);
+ }
+ }
+
+ return Value::getNull();
+ }
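+
+    /* illustrative example: in the router, { $addToSet: "$x" } receives one
+       array per shard, e.g. [1,2] and [2,3]; the loop above folds their
+       elements into a single set, so getValue() returns the distinct values
+       1, 2, 3 (in no particular order) rather than an array of arrays. */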
+
+ intrusive_ptr<const Value> AccumulatorAddToSet::getValue() const {
+ vector<intrusive_ptr<const Value> > valVec;
+
+ for (itr = set.begin(); itr != set.end(); ++itr) {
+ valVec.push_back(*itr);
+ }
+ /* there is no issue of scope since createArray copy constructs */
+ return Value::createArray(valVec);
+ }
+
+ AccumulatorAddToSet::AccumulatorAddToSet(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ Accumulator(),
+ set(),
+ pCtx(pTheCtx) {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorAddToSet::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorAddToSet> pAccumulator(
+ new AccumulatorAddToSet(pCtx));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorAddToSet::getOpName() const {
+ return "$addToSet";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_avg.cpp b/src/mongo/db/pipeline/accumulator_avg.cpp
new file mode 100755
index 00000000000..9f18b1820c8
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_avg.cpp
@@ -0,0 +1,123 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char AccumulatorAvg::subTotalName[] = "subTotal";
+ const char AccumulatorAvg::countName[] = "count";
+
+ intrusive_ptr<const Value> AccumulatorAvg::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ if (!pCtx->getInRouter()) {
+ Super::evaluate(pDocument);
+ ++count;
+ }
+ else {
+ /*
+ If we're in the router, we expect an object that contains
+ both a subtotal and a count. This is what getValue() produced
+ below.
+ */
+ intrusive_ptr<const Value> prhs(
+ vpOperand[0]->evaluate(pDocument));
+ assert(prhs->getType() == Object);
+ intrusive_ptr<Document> pShardDoc(prhs->getDocument());
+
+ intrusive_ptr<const Value> pSubTotal(
+ pShardDoc->getValue(subTotalName));
+ assert(pSubTotal.get());
+ BSONType subTotalType = pSubTotal->getType();
+ if ((totalType == NumberLong) || (subTotalType == NumberLong))
+ totalType = NumberLong;
+ if ((totalType == NumberDouble) || (subTotalType == NumberDouble))
+ totalType = NumberDouble;
+
+ if (subTotalType == NumberInt) {
+ int v = pSubTotal->getInt();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else if (subTotalType == NumberLong) {
+ long long v = pSubTotal->getLong();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else {
+ double v = pSubTotal->getDouble();
+ doubleTotal += v;
+ }
+
+ intrusive_ptr<const Value> pCount(pShardDoc->getValue(countName));
+ count += pCount->getLong();
+ }
+
+ return Value::getZero();
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorAvg::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorAvg> pA(new AccumulatorAvg(pCtx));
+ return pA;
+ }
+
+ intrusive_ptr<const Value> AccumulatorAvg::getValue() const {
+ if (!pCtx->getInShard()) {
+ double avg = 0;
+ if (count) {
+ if (totalType != NumberDouble)
+                    avg = static_cast<double>(longTotal) / count; // cast before dividing to avoid integer truncation
+ else
+ avg = doubleTotal / count;
+ }
+
+ return Value::createDouble(avg);
+ }
+
+ intrusive_ptr<Document> pDocument(Document::create());
+
+ intrusive_ptr<const Value> pSubTotal;
+ if (totalType == NumberInt)
+ pSubTotal = Value::createInt((int)longTotal);
+ else if (totalType == NumberLong)
+ pSubTotal = Value::createLong(longTotal);
+ else
+ pSubTotal = Value::createDouble(doubleTotal);
+ pDocument->addField(subTotalName, pSubTotal);
+
+ intrusive_ptr<const Value> pCount(Value::createLong(count));
+ pDocument->addField(countName, pCount);
+
+ return Value::createDocument(pDocument);
+ }
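+
+    /* illustrative example: in a shard, getValue() above emits a partial
+       result such as { subTotal: 42, count: NumberLong(7) }; the router's
+       evaluate() consumes exactly this shape and divides at the end. */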
+
+ AccumulatorAvg::AccumulatorAvg(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ AccumulatorSum(),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ const char *AccumulatorAvg::getOpName() const {
+ return "$avg";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_first.cpp b/src/mongo/db/pipeline/accumulator_first.cpp
new file mode 100755
index 00000000000..c947aa83996
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_first.cpp
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorFirst::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+
+ /* only remember the first value seen */
+ if (!pValue.get())
+ pValue = vpOperand[0]->evaluate(pDocument);
+
+ return pValue;
+ }
+
+ AccumulatorFirst::AccumulatorFirst():
+ AccumulatorSingleValue() {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorFirst::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorFirst> pAccumulator(
+ new AccumulatorFirst());
+ return pAccumulator;
+ }
+
+ const char *AccumulatorFirst::getOpName() const {
+ return "$first";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_last.cpp b/src/mongo/db/pipeline/accumulator_last.cpp
new file mode 100755
index 00000000000..c134fc83159
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_last.cpp
@@ -0,0 +1,48 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorLast::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+
+ /* always remember the last value seen */
+ pValue = vpOperand[0]->evaluate(pDocument);
+
+ return pValue;
+ }
+
+ AccumulatorLast::AccumulatorLast():
+ AccumulatorSingleValue() {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorLast::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorLast> pAccumulator(
+ new AccumulatorLast());
+ return pAccumulator;
+ }
+
+ const char *AccumulatorLast::getOpName() const {
+ return "$last";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_min_max.cpp b/src/mongo/db/pipeline/accumulator_min_max.cpp
new file mode 100755
index 00000000000..6f078187b44
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_min_max.cpp
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorMinMax::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ /* if this is the first value, just use it */
+ if (!pValue.get())
+ pValue = prhs;
+ else {
+ /* compare with the current value; swap if appropriate */
+ int cmp = Value::compare(pValue, prhs) * sense;
+ if (cmp > 0)
+ pValue = prhs;
+ }
+
+ return pValue;
+ }
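+
+    /* worked example: with sense == -1 ($max), Value::compare(pValue, prhs)
+       is negated, so cmp > 0 exactly when the new value is greater and the
+       swap keeps the maximum; sense == 1 keeps the minimum symmetrically. */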
+
+ AccumulatorMinMax::AccumulatorMinMax(int theSense):
+ AccumulatorSingleValue(),
+ sense(theSense) {
+ assert((sense == 1) || (sense == -1));
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorMinMax::createMin(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorMinMax> pAccumulator(
+ new AccumulatorMinMax(1));
+ return pAccumulator;
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorMinMax::createMax(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorMinMax> pAccumulator(
+ new AccumulatorMinMax(-1));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorMinMax::getOpName() const {
+ if (sense == 1)
+ return "$min";
+ return "$max";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_push.cpp b/src/mongo/db/pipeline/accumulator_push.cpp
new file mode 100755
index 00000000000..2640bc4ecfd
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_push.cpp
@@ -0,0 +1,73 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ intrusive_ptr<const Value> AccumulatorPush::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ if (prhs->getType() == Undefined)
+ ; /* nothing to add to the array */
+ else if (!pCtx->getInRouter())
+ vpValue.push_back(prhs);
+ else {
+ /*
+ If we're in the router, we need to take apart the arrays we
+ receive and put their elements into the array we are collecting.
+ If we didn't, then we'd get an array of arrays, with one array
+ from each shard that responds.
+ */
+ assert(prhs->getType() == Array);
+
+ intrusive_ptr<ValueIterator> pvi(prhs->getArray());
+ while(pvi->more()) {
+ intrusive_ptr<const Value> pElement(pvi->next());
+ vpValue.push_back(pElement);
+ }
+ }
+
+ return Value::getNull();
+ }
+
+ intrusive_ptr<const Value> AccumulatorPush::getValue() const {
+ return Value::createArray(vpValue);
+ }
+
+ AccumulatorPush::AccumulatorPush(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ Accumulator(),
+ vpValue(),
+ pCtx(pTheCtx) {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorPush::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorPush> pAccumulator(
+ new AccumulatorPush(pCtx));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorPush::getOpName() const {
+ return "$push";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_single_value.cpp b/src/mongo/db/pipeline/accumulator_single_value.cpp
new file mode 100755
index 00000000000..bfec80387d3
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_single_value.cpp
@@ -0,0 +1,32 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorSingleValue::getValue() const {
+ return pValue;
+ }
+
+ AccumulatorSingleValue::AccumulatorSingleValue():
+ pValue(intrusive_ptr<const Value>()) {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/accumulator_sum.cpp b/src/mongo/db/pipeline/accumulator_sum.cpp
new file mode 100755
index 00000000000..e6526ac254a
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_sum.cpp
@@ -0,0 +1,74 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorSum::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ /* upgrade to the widest type required to hold the result */
+ totalType = Value::getWidestNumeric(totalType, prhs->getType());
+
+ if (totalType == NumberInt) {
+ int v = prhs->coerceToInt();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else if (totalType == NumberLong) {
+ long long v = prhs->coerceToLong();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else { /* (totalType == NumberDouble) */
+ double v = prhs->coerceToDouble();
+ doubleTotal += v;
+ }
+
+ return Value::getZero();
+ }
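+
+    /* worked example: summing NumberInt(1), NumberInt(2), NumberDouble(0.5)
+       widens totalType from NumberInt to NumberDouble on the third input,
+       so getValue() below returns NumberDouble(3.5). */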
+
+ intrusive_ptr<Accumulator> AccumulatorSum::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorSum> pSummer(new AccumulatorSum());
+ return pSummer;
+ }
+
+ intrusive_ptr<const Value> AccumulatorSum::getValue() const {
+ if (totalType == NumberInt)
+ return Value::createInt((int)longTotal);
+ if (totalType == NumberLong)
+ return Value::createLong(longTotal);
+ return Value::createDouble(doubleTotal);
+ }
+
+ AccumulatorSum::AccumulatorSum():
+ Accumulator(),
+ totalType(NumberInt),
+ longTotal(0),
+ doubleTotal(0) {
+ }
+
+ const char *AccumulatorSum::getOpName() const {
+ return "$sum";
+ }
+}
diff --git a/src/mongo/db/pipeline/builder.cpp b/src/mongo/db/pipeline/builder.cpp
new file mode 100755
index 00000000000..cbde3705656
--- /dev/null
+++ b/src/mongo/db/pipeline/builder.cpp
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+
+
+namespace mongo {
+
+ void BuilderObj::append() {
+ pBuilder->appendNull(fieldName);
+ }
+
+ void BuilderObj::append(bool b) {
+ pBuilder->append(fieldName, b);
+ }
+
+ void BuilderObj::append(int i) {
+ pBuilder->append(fieldName, i);
+ }
+
+ void BuilderObj::append(long long ll) {
+ pBuilder->append(fieldName, ll);
+ }
+
+ void BuilderObj::append(double d) {
+ pBuilder->append(fieldName, d);
+ }
+
+ void BuilderObj::append(string s) {
+ pBuilder->append(fieldName, s);
+ }
+
+ void BuilderObj::append(const OID &o) {
+ pBuilder->append(fieldName, o);
+ }
+
+ void BuilderObj::append(const Date_t &d) {
+ pBuilder->append(fieldName, d);
+ }
+
+ void BuilderObj::append(BSONObjBuilder *pDone) {
+ pBuilder->append(fieldName, pDone->done());
+ }
+
+ void BuilderObj::append(BSONArrayBuilder *pDone) {
+ pBuilder->append(fieldName, pDone->arr());
+ }
+
+ BuilderObj::BuilderObj(
+ BSONObjBuilder *pObjBuilder, string theFieldName):
+ pBuilder(pObjBuilder),
+ fieldName(theFieldName) {
+ }
+
+
+ void BuilderArray::append() {
+ pBuilder->appendNull();
+ }
+
+ void BuilderArray::append(bool b) {
+ pBuilder->append(b);
+ }
+
+ void BuilderArray::append(int i) {
+ pBuilder->append(i);
+ }
+
+ void BuilderArray::append(long long ll) {
+ pBuilder->append(ll);
+ }
+
+ void BuilderArray::append(double d) {
+ pBuilder->append(d);
+ }
+
+ void BuilderArray::append(string s) {
+ pBuilder->append(s);
+ }
+
+ void BuilderArray::append(const OID &o) {
+ pBuilder->append(o);
+ }
+
+ void BuilderArray::append(const Date_t &d) {
+ pBuilder->append(d);
+ }
+
+ void BuilderArray::append(BSONObjBuilder *pDone) {
+ pBuilder->append(pDone->done());
+ }
+
+ void BuilderArray::append(BSONArrayBuilder *pDone) {
+ pBuilder->append(pDone->arr());
+ }
+
+ BuilderArray::BuilderArray(
+ BSONArrayBuilder *pArrayBuilder):
+ pBuilder(pArrayBuilder) {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/builder.h b/src/mongo/db/pipeline/builder.h
new file mode 100755
index 00000000000..bdf71cd784c
--- /dev/null
+++ b/src/mongo/db/pipeline/builder.h
@@ -0,0 +1,95 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ class BSONArrayBuilder;
+ class BSONObjBuilder;
+
+ /*
+ Generic Builder.
+
+ The methods to append items to an object (on BSONObjBuilder) and an array
+ (on BSONArrayBuilder) differ only by their inclusion of a field name.
+ For more complicated implementations of addToBsonObj() and
+ addToBsonArray(), it makes sense to abstract that out and use
+ this generic builder that always looks the same, and then implement
+ addToBsonObj() and addToBsonArray() by using a common method.
+ */
+ class Builder :
+ boost::noncopyable {
+ public:
+ virtual ~Builder() {};
+
+ virtual void append() = 0; // append a null
+ virtual void append(bool b) = 0;
+ virtual void append(int i) = 0;
+ virtual void append(long long ll) = 0;
+ virtual void append(double d) = 0;
+ virtual void append(string s) = 0;
+ virtual void append(const OID &o) = 0;
+ virtual void append(const Date_t &d) = 0;
+ virtual void append(BSONObjBuilder *pDone) = 0;
+ virtual void append(BSONArrayBuilder *pDone) = 0;
+ };
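+
+    /* usage sketch (illustrative): a single serializer can then target either
+       an object or an array context without duplicating the append logic:
+
+           void writeN(Builder *pB) { pB->append(5); }
+           // object: BuilderObj bo(pObjBuilder, "n");  writeN(&bo);
+           // array:  BuilderArray ba(pArrayBuilder);   writeN(&ba);
+    */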
+
+ class BuilderObj :
+ public Builder {
+ public:
+ // virtuals from Builder
+ virtual void append();
+ virtual void append(bool b);
+ virtual void append(int i);
+ virtual void append(long long ll);
+ virtual void append(double d);
+ virtual void append(string s);
+ virtual void append(const OID &o);
+ virtual void append(const Date_t &d);
+ virtual void append(BSONObjBuilder *pDone);
+ virtual void append(BSONArrayBuilder *pDone);
+
+ BuilderObj(BSONObjBuilder *pBuilder, string fieldName);
+
+ private:
+ BSONObjBuilder *pBuilder;
+ string fieldName;
+ };
+
+ class BuilderArray :
+ public Builder {
+ public:
+ // virtuals from Builder
+ virtual void append();
+ virtual void append(bool b);
+ virtual void append(int i);
+ virtual void append(long long ll);
+ virtual void append(double d);
+ virtual void append(string s);
+ virtual void append(const OID &o);
+ virtual void append(const Date_t &d);
+ virtual void append(BSONObjBuilder *pDone);
+ virtual void append(BSONArrayBuilder *pDone);
+
+ BuilderArray(BSONArrayBuilder *pBuilder);
+
+ private:
+ BSONArrayBuilder *pBuilder;
+ };
+}
diff --git a/src/mongo/db/pipeline/doc_mem_monitor.cpp b/src/mongo/db/pipeline/doc_mem_monitor.cpp
new file mode 100755
index 00000000000..ffbe9c88854
--- /dev/null
+++ b/src/mongo/db/pipeline/doc_mem_monitor.cpp
@@ -0,0 +1,68 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/doc_mem_monitor.h"
+#include "util/systeminfo.h"
+
+namespace mongo {
+
+ DocMemMonitor::DocMemMonitor(StringWriter *pW) {
+ /*
+ Use the default values.
+
+          Currently, we warn in the log at 5% of physical RAM, and assert at 10%.
+ */
+ size_t errorRam = SystemInfo::getPhysicalRam() / 10;
+ size_t warnRam = errorRam / 2;
+
+ init(pW, warnRam, errorRam);
+ }
+
+ DocMemMonitor::DocMemMonitor(StringWriter *pW,
+ size_t warnLimit, size_t errorLimit) {
+ init(pW, warnLimit, errorLimit);
+ }
+
+ void DocMemMonitor::addToTotal(size_t amount) {
+ totalUsed += amount;
+
+ if (!warned) {
+ if (warnLimit && (totalUsed > warnLimit)) {
+ stringstream ss;
+ ss << "warning, 5% of physical RAM used for ";
+ pWriter->writeString(ss);
+ ss << endl;
+ warning() << ss.str();
+ warned = true;
+ }
+ }
+
+ if (errorLimit) {
+ uassert(15944, "terminating request: request heap use exceeded 10% of physical RAM", (totalUsed <= errorLimit));
+ }
+ }
+
+ void DocMemMonitor::init(StringWriter *pW,
+ size_t warnLimit, size_t errorLimit) {
+ this->pWriter = pW;
+ this->warnLimit = warnLimit;
+ this->errorLimit = errorLimit;
+
+ warned = false;
+ totalUsed = 0;
+ }
+}
diff --git a/src/mongo/db/pipeline/doc_mem_monitor.h b/src/mongo/db/pipeline/doc_mem_monitor.h
new file mode 100755
index 00000000000..e368acc906a
--- /dev/null
+++ b/src/mongo/db/pipeline/doc_mem_monitor.h
@@ -0,0 +1,94 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "util/string_writer.h"
+
+
+namespace mongo {
+
+ /*
+ This utility class provides an easy way to total up, monitor, warn, and
+ signal an error when the amount of memory used for an operation exceeds
+ given thresholds.
+
+ Create a local instance of this class, and then inform it of any memory
+ that you consume using addToTotal().
+
+ Warnings or errors are issued as usage exceeds certain fractions of
+ physical memory on the host, as determined by SystemInfo.
+
+ This class is not guaranteed to warn or signal errors if the host system
+ does not support the ability to report its memory, as per the warnings
+ for SystemInfo in systeminfo.h.
+ */
+ class DocMemMonitor {
+ public:
+ /*
+ Constructor.
+
+ Uses default limits for warnings and errors.
+
+ The StringWriter parameter must outlive the DocMemMonitor instance.
+
+ @param pWriter string writer that provides information about the
+ operation being monitored
+ */
+ DocMemMonitor(StringWriter *pWriter);
+
+ /*
+ Constructor.
+
+ This variant allows explicit selection of the limits. Note that
+ limits of zero are treated as infinite.
+
+ The StringWriter parameter must outlive the DocMemMonitor instance.
+
+ @param pWriter string writer that provides information about the
+ operation being monitored
+          @param warnLimit the amount of RAM usage at which to log a warning
+          @param errorLimit the amount of RAM usage at which to throw an error
+ */
+ DocMemMonitor(StringWriter *pWriter, size_t warnLimit,
+ size_t errorLimit);
+
+ /*
+ Increment the total amount of memory used by the given amount. If
+ the warning threshold is exceeded, a warning will be logged. If the
+ error threshold is exceeded, an error will be thrown.
+
+ @param amount the amount of memory to add to the current total
+ */
+ void addToTotal(size_t amount);
+
+ private:
+ /*
+ Real constructor body.
+
+ Provides common construction for all the variant constructors.
+ */
+ void init(StringWriter *pW, size_t warnLimit, size_t errorLimit);
+
+ bool warned;
+ size_t totalUsed;
+ size_t warnLimit;
+ size_t errorLimit;
+ StringWriter *pWriter;
+ };
+
+}
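A minimal sketch of the intended usage, assuming a StringWriter subclass that names
the operation being monitored (OpNameWriter here is hypothetical; only the
writeString() signature is taken from util/string_writer.h):

    // Hypothetical writer that identifies the operation in warnings.
    class OpNameWriter : public StringWriter {
    public:
        virtual void writeString(stringstream &ss) const {
            ss << "example pipeline operation";
        }
    };

    void consumeBuffers() {
        OpNameWriter writer;
        DocMemMonitor monitor(&writer); // default limits: warn 5%, error 10%

        for (int i = 0; i < 1000; ++i) {
            size_t bufferSize = 1024 * 1024; // pretend we allocated this
            // logs once past the warn limit, uasserts past the error limit
            monitor.addToTotal(bufferSize);
        }
    }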
diff --git a/src/mongo/db/pipeline/document.cpp b/src/mongo/db/pipeline/document.cpp
new file mode 100755
index 00000000000..a49c7e303c1
--- /dev/null
+++ b/src/mongo/db/pipeline/document.cpp
@@ -0,0 +1,219 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/value.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ string Document::idName("_id");
+
+ intrusive_ptr<Document> Document::createFromBsonObj(BSONObj *pBsonObj) {
+ intrusive_ptr<Document> pDocument(new Document(pBsonObj));
+ return pDocument;
+ }
+
+ Document::Document(BSONObj *pBsonObj):
+ vFieldName(),
+ vpValue() {
+ BSONObjIterator bsonIterator(pBsonObj->begin());
+ while(bsonIterator.more()) {
+ BSONElement bsonElement(bsonIterator.next());
+ string fieldName(bsonElement.fieldName());
+ intrusive_ptr<const Value> pValue(
+ Value::createFromBsonElement(&bsonElement));
+
+ vFieldName.push_back(fieldName);
+ vpValue.push_back(pValue);
+ }
+ }
+
+ void Document::toBson(BSONObjBuilder *pBuilder) {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i)
+ vpValue[i]->addToBsonObj(pBuilder, vFieldName[i]);
+ }
+
+ intrusive_ptr<Document> Document::create(size_t sizeHint) {
+ intrusive_ptr<Document> pDocument(new Document(sizeHint));
+ return pDocument;
+ }
+
+ Document::Document(size_t sizeHint):
+ vFieldName(),
+ vpValue() {
+ if (sizeHint) {
+ vFieldName.reserve(sizeHint);
+ vpValue.reserve(sizeHint);
+ }
+ }
+
+ intrusive_ptr<Document> Document::clone() {
+ const size_t n = vFieldName.size();
+ intrusive_ptr<Document> pNew(Document::create(n));
+ for(size_t i = 0; i < n; ++i)
+ pNew->addField(vFieldName[i], vpValue[i]);
+
+ return pNew;
+ }
+
+ Document::~Document() {
+ }
+
+ FieldIterator *Document::createFieldIterator() {
+ return new FieldIterator(intrusive_ptr<Document>(this));
+ }
+
+ intrusive_ptr<const Value> Document::getValue(const string &fieldName) {
+ /*
+ For now, assume the number of fields is small enough that iteration
+ is ok. Later, if this gets large, we can create a map into the
+ vector for these lookups.
+
+ Note that because of the schema-less nature of this data, we always
+ have to look, and can't assume that the requested field is always
+ in a particular place as we would with a statically compilable
+ reference.
+ */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpValue[i];
+ }
+
+ return(intrusive_ptr<const Value>());
+ }
+
+ void Document::addField(const string &fieldName,
+ const intrusive_ptr<const Value> &pValue) {
+ uassert(15945, str::stream() << "cannot add undefined field " <<
+ fieldName << " to document", pValue->getType() != Undefined);
+
+ vFieldName.push_back(fieldName);
+ vpValue.push_back(pValue);
+ }
+
+ void Document::setField(size_t index,
+ const string &fieldName,
+ const intrusive_ptr<const Value> &pValue) {
+ /* special case: should this field be removed? */
+ if (!pValue.get()) {
+ vFieldName.erase(vFieldName.begin() + index);
+ vpValue.erase(vpValue.begin() + index);
+ return;
+ }
+
+ /* make sure we have a valid value */
+ uassert(15968, str::stream() << "cannot set undefined field " <<
+ fieldName << " to document", pValue->getType() != Undefined);
+
+ /* set the indicated field */
+ vFieldName[index] = fieldName;
+ vpValue[index] = pValue;
+ }
+
+ intrusive_ptr<const Value> Document::getField(const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpValue[i];
+ }
+
+ /* if we got here, there's no such field */
+ return intrusive_ptr<const Value>();
+ }
+
+ size_t Document::getApproximateSize() const {
+ size_t size = sizeof(Document);
+ const size_t n = vpValue.size();
+ for(size_t i = 0; i < n; ++i)
+ size += vpValue[i]->getApproximateSize();
+
+ return size;
+ }
+
+ size_t Document::getFieldIndex(const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ size_t i = 0;
+ for(; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ break;
+ }
+
+ return i;
+ }
+
+ void Document::hash_combine(size_t &seed) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ boost::hash_combine(seed, vFieldName[i]);
+ vpValue[i]->hash_combine(seed);
+ }
+ }
+
+ int Document::compare(const intrusive_ptr<Document> &rL,
+ const intrusive_ptr<Document> &rR) {
+ const size_t lSize = rL->vFieldName.size();
+ const size_t rSize = rR->vFieldName.size();
+
+ for(size_t i = 0; true; ++i) {
+ if (i >= lSize) {
+ if (i >= rSize)
+ return 0; // documents are the same length
+
+ return -1; // left document is shorter
+ }
+
+ if (i >= rSize)
+ return 1; // right document is shorter
+
+ const int nameCmp = rL->vFieldName[i].compare(rR->vFieldName[i]);
+ if (nameCmp)
+ return nameCmp; // field names are unequal
+
+ const int valueCmp = Value::compare(rL->vpValue[i], rR->vpValue[i]);
+ if (valueCmp)
+ return valueCmp; // fields are unequal
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ return 0;
+ }
+
+ /* ----------------------- FieldIterator ------------------------------- */
+
+ FieldIterator::FieldIterator(const intrusive_ptr<Document> &pTheDocument):
+ pDocument(pTheDocument),
+ index(0) {
+ }
+
+ bool FieldIterator::more() const {
+ return (index < pDocument->vFieldName.size());
+ }
+
+ pair<string, intrusive_ptr<const Value> > FieldIterator::next() {
+ assert(more());
+ pair<string, intrusive_ptr<const Value> > result(
+ pDocument->vFieldName[index], pDocument->vpValue[index]);
+ ++index;
+ return result;
+ }
+}
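A short sketch of building a Document from BSON and walking its fields, assuming
the interfaces above (note that createFieldIterator() returns a raw pointer the
caller owns):

    void dumpFieldNames(BSONObj obj) {
        // The BSONObj must outlive the Document; values may point into it.
        intrusive_ptr<Document> pDoc(Document::createFromBsonObj(&obj));

        boost::scoped_ptr<FieldIterator> pIter(pDoc->createFieldIterator());
        while (pIter->more()) {
            pair<string, intrusive_ptr<const Value> > field(pIter->next());
            cout << field.first << endl; // field names in BSON order
        }
    }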
diff --git a/src/mongo/db/pipeline/document.h b/src/mongo/db/pipeline/document.h
new file mode 100755
index 00000000000..f11a825151e
--- /dev/null
+++ b/src/mongo/db/pipeline/document.h
@@ -0,0 +1,246 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+ class BSONObj;
+ class FieldIterator;
+ class Value;
+
+ class Document :
+ public IntrusiveCounterUnsigned {
+ public:
+ ~Document();
+
+ /*
+ Create a new Document from the given BSONObj.
+
+ Document field values may be pointed to in the BSONObj, so it
+ must live at least as long as the resulting Document.
+
+ @returns shared pointer to the newly created Document
+ */
+ static intrusive_ptr<Document> createFromBsonObj(BSONObj *pBsonObj);
+
+ /*
+ Create a new empty Document.
+
+ @param sizeHint a hint at what the number of fields will be; if
+ known, this can be used to increase memory allocation efficiency
+ @returns shared pointer to the newly created Document
+ */
+ static intrusive_ptr<Document> create(size_t sizeHint = 0);
+
+ /*
+ Clone a document.
+
+ The new document shares all the fields' values with the original.
+
+ This is not a deep copy. Only the fields on the top-level document
+ are cloned.
+
+ @returns the shallow clone of the document
+ */
+ intrusive_ptr<Document> clone();
+
+ /*
+ Add this document to the BSONObj under construction with the
+ given BSONObjBuilder.
+ */
+ void toBson(BSONObjBuilder *pBsonObjBuilder);
+
+ /*
+ Create a new FieldIterator that can be used to examine the
+ Document's fields.
+ */
+ FieldIterator *createFieldIterator();
+
+ /*
+ Get the value of the specified field.
+
+ @param fieldName the name of the field
+            @returns a pointer to the requested field's value
+ */
+ intrusive_ptr<const Value> getValue(const string &fieldName);
+
+ /*
+ Add the given field to the Document.
+
+            BSON documents' fields are ordered; the new field will be
+            appended to the current list of fields.
+
+ It is an error to add a field that has the same name as another
+ field.
+ */
+ void addField(const string &fieldName,
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Set the given field to be at the specified position in the
+ Document. This will replace any field that is currently in that
+ position. The index must be within the current range of field
+ indices.
+
+ pValue.get() may be NULL, in which case the field will be
+ removed. fieldName is ignored in this case.
+
+ @param index the field index in the list of fields
+ @param fieldName the new field name
+ @param pValue the new Value
+ */
+ void setField(size_t index,
+ const string &fieldName,
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Convenience type for dealing with fields.
+ */
+ typedef pair<string, intrusive_ptr<const Value> > FieldPair;
+
+ /*
+ Get the indicated field.
+
+ @param index the field index in the list of fields
+ @returns the field name and value of the field
+ */
+ FieldPair getField(size_t index) const;
+
+ /*
+ Get the number of fields in the Document.
+
+ @returns the number of fields in the Document
+ */
+ size_t getFieldCount() const;
+
+ /*
+ Get the index of the given field.
+
+ @param fieldName the name of the field
+ @returns the index of the field, or if it does not exist, the number
+ of fields (getFieldCount())
+ */
+ size_t getFieldIndex(const string &fieldName) const;
+
+ /*
+ Get a field by name.
+
+ @param fieldName the name of the field
+ @returns the value of the field
+ */
+ intrusive_ptr<const Value> getField(const string &fieldName) const;
+
+ /*
+ Get the approximate storage size of the document, in bytes.
+
+ Under the assumption that field name strings are shared, they are
+ not included in the total.
+
+          @returns the approximate storage size, in bytes
+ */
+ size_t getApproximateSize() const;
+
+ /*
+ Compare two documents.
+
+ BSON document field order is significant, so this just goes through
+ the fields in order. The comparison is done in roughly the same way
+ as strings are compared, but comparing one field at a time instead
+ of one character at a time.
+ */
+ static int compare(const intrusive_ptr<Document> &rL,
+ const intrusive_ptr<Document> &rR);
+
+ static string idName; // shared "_id"
+
+ /*
+ Calculate a hash value.
+
+ Meant to be used to create composite hashes suitable for
+ boost classes such as unordered_map<>.
+
+ @param seed value to augment with this' hash
+ */
+ void hash_combine(size_t &seed) const;
+
+ private:
+ friend class FieldIterator;
+
+ Document(size_t sizeHint);
+ Document(BSONObj *pBsonObj);
+
+ /* these two vectors parallel each other */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<const Value> > vpValue;
+ };
+
+
+ class FieldIterator :
+ boost::noncopyable {
+ public:
+ /*
+ Ask if there are more fields to return.
+
+ @return true if there are more fields, false otherwise
+ */
+ bool more() const;
+
+ /*
+ Move the iterator to point to the next field and return it.
+
+ @return the next field's <name, Value>
+ */
+ Document::FieldPair next();
+
+ private:
+ friend class Document;
+
+ /*
+ Constructor.
+
+ @param pDocument points to the document whose fields are being
+ iterated
+ */
+ FieldIterator(const intrusive_ptr<Document> &pDocument);
+
+ /*
+ We'll hang on to the original document to ensure we keep the
+ fieldPtr vector alive.
+ */
+ intrusive_ptr<Document> pDocument;
+ size_t index; // current field in iteration
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline size_t Document::getFieldCount() const {
+ return vFieldName.size();
+ }
+
+ inline Document::FieldPair Document::getField(size_t index) const {
+ assert( index < vFieldName.size() );
+ return FieldPair(vFieldName[index], vpValue[index]);
+ }
+
+}
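A sketch of the prefix-style comparison semantics described above, assuming two
documents built with create()/addField(); field order is significant:

    // {a: v} vs. {a: v, b: w}: the shorter document compares less, the
    // same way a string compares less than another it is a prefix of.
    int comparePrefixSketch(const intrusive_ptr<const Value> &pV,
                            const intrusive_ptr<const Value> &pW) {
        intrusive_ptr<Document> pLeft(Document::create());
        pLeft->addField("a", pV);

        intrusive_ptr<Document> pRight(Document::create());
        pRight->addField("a", pV);
        pRight->addField("b", pW);

        return Document::compare(pLeft, pRight); // negative: left is shorter
    }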
diff --git a/src/mongo/db/pipeline/document_source.cpp b/src/mongo/db/pipeline/document_source.cpp
new file mode 100755
index 00000000000..813852e35c6
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source.cpp
@@ -0,0 +1,52 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+namespace mongo {
+ DocumentSource::~DocumentSource() {
+ }
+
+ void DocumentSource::setSource(
+ const intrusive_ptr<DocumentSource> &pTheSource) {
+ assert(!pSource.get());
+ pSource = pTheSource;
+ }
+
+ bool DocumentSource::coalesce(
+ const intrusive_ptr<DocumentSource> &pNextSource) {
+ return false;
+ }
+
+ void DocumentSource::optimize() {
+ }
+
+ void DocumentSource::addToBsonArray(BSONArrayBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ sourceToBson(&insides);
+ pBuilder->append(insides.done());
+ }
+
+ void DocumentSource::writeString(stringstream &ss) const {
+ BSONArrayBuilder bab;
+ addToBsonArray(&bab);
+ BSONArray ba(bab.arr());
+ ss << ba.toString(/* isArray */true);
+ // our toString should use standard string types.....
+ }
+}
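A sketch of how addToBsonArray() composes a pipeline description, assuming a
vector of already-constructed sources: each source contributes one single-field
object (e.g. { $match: ... }) via its sourceToBson() override.

    BSONArray describePipeline(
        const vector<intrusive_ptr<DocumentSource> > &vpSource) {
        BSONArrayBuilder arrayBuilder;
        const size_t n = vpSource.size();
        for (size_t i = 0; i < n; ++i)
            vpSource[i]->addToBsonArray(&arrayBuilder);

        return arrayBuilder.arr(); // e.g. [{ $match: ... }, { $group: ... }]
    }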
diff --git a/src/mongo/db/pipeline/document_source.h b/src/mongo/db/pipeline/document_source.h
new file mode 100755
index 00000000000..8d5f0f70847
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source.h
@@ -0,0 +1,985 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include <boost/unordered_map.hpp>
+#include "util/intrusive_counter.h"
+#include "client/parallel.h"
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+#include "util/string_writer.h"
+
+namespace mongo {
+ class Accumulator;
+ class Cursor;
+ class Document;
+ class Expression;
+ class ExpressionContext;
+ class ExpressionFieldPath;
+ class ExpressionObject;
+ class Matcher;
+
+ class DocumentSource :
+ public IntrusiveCounterUnsigned,
+ public StringWriter {
+ public:
+ virtual ~DocumentSource();
+
+ // virtuals from StringWriter
+ /*
+ Write out a string representation of this pipeline operator.
+
+ @param ss string stream to write the string representation to
+ */
+ virtual void writeString(stringstream &ss) const;
+
+
+ /*
+ Is the source at EOF?
+
+ @returns true if the source has no more Documents to return.
+ */
+ virtual bool eof() = 0;
+
+ /*
+ Advance the state of the DocumentSource so that it will return the
+ next Document.
+
+ @returns whether there is another document to fetch, i.e., whether or
+ not getCurrent() will succeed.
+ */
+ virtual bool advance() = 0;
+
+ /*
+          Return the Document at the source's current position.
+
+          @returns the current Document
+          TODO throws an exception if there are no more documents to return.
+ */
+ virtual intrusive_ptr<Document> getCurrent() = 0;
+
+ /*
+ Set the underlying source this source should use to get Documents
+ from.
+
+ It is an error to set the source more than once. This is to
+ prevent changing sources once the original source has been started;
+ this could break the state maintained by the DocumentSource.
+
+ @param pSource the underlying source to use
+ */
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Attempt to coalesce this DocumentSource with its successor in the
+ document processing pipeline. If successful, the successor
+ DocumentSource should be removed from the pipeline and discarded.
+
+ If successful, this operation can be applied repeatedly, in an
+ attempt to coalesce several sources together.
+
+ The default implementation is to do nothing, and return false.
+
+ @param pNextSource the next source in the document processing chain.
+          @returns whether the attempt to coalesce was successful;
+ if the attempt was not successful, nothing has been changed
+ */
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+
+ /*
+ Optimize the pipeline operation, if possible. This is a local
+ optimization that only looks within this DocumentSource. For best
+ results, first coalesce compatible sources using coalesce().
+
+ This is intended for any operations that include expressions, and
+ provides a hook for those to optimize those operations.
+
+ The default implementation is to do nothing.
+ */
+ virtual void optimize();
+
+ /*
+ Add the DocumentSource to the array builder.
+
+ The default implementation calls sourceToBson() in order to
+ convert the inner part of the object which will be added to the
+ array being built here.
+
+ @param pBuilder the array builder to add the operation to.
+ */
+ virtual void addToBsonArray(BSONArrayBuilder *pBuilder) const;
+
+ protected:
+ /*
+ Create an object that represents the document source. The object
+ will have a single field whose name is the source's name. This
+ will be used by the default implementation of addToBsonArray()
+ to add this object to a pipeline being represented in BSON.
+
+ @param pBuilder a blank object builder to write to
+ */
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const = 0;
+
+ /*
+ Most DocumentSources have an underlying source they get their data
+ from. This is a convenience for them.
+
+ The default implementation of setSource() sets this; if you don't
+ need a source, override that to assert(). The default is to
+ assert() if this has already been set.
+ */
+ intrusive_ptr<DocumentSource> pSource;
+ };
+
+
+ class DocumentSourceBsonArray :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceBsonArray();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Create a document source based on a BSON array.
+
+ This is usually put at the beginning of a chain of document sources
+ in order to fetch data from the database.
+
+ CAUTION: the BSON is not read until the source is used. Any
+ elements that appear after these documents must not be read until
+ this source is exhausted.
+
+ @param pBsonElement the BSON array to treat as a document source
+ @returns the newly created document source
+ */
+ static intrusive_ptr<DocumentSourceBsonArray> create(
+ BSONElement *pBsonElement);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceBsonArray(BSONElement *pBsonElement);
+
+ BSONObj embeddedObject;
+ BSONObjIterator arrayIterator;
+ BSONElement currentElement;
+ bool haveCurrent;
+ };
+
+
+ class DocumentSourceCommandFutures :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceCommandFutures();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /* convenient shorthand for a commonly used type */
+ typedef list<shared_ptr<Future::CommandResult> > FuturesList;
+
+ /*
+          Create a DocumentSource that wraps a list of Future::CommandResults.
+
+ @param errmsg place to write error messages to; must exist for the
+ lifetime of the created DocumentSourceCommandFutures
+ @param pList the list of futures
+ */
+ static intrusive_ptr<DocumentSourceCommandFutures> create(
+ string &errmsg, FuturesList *pList);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceCommandFutures(string &errmsg, FuturesList *pList);
+
+ /*
+ Advance to the next document, setting pCurrent appropriately.
+
+ Adjusts pCurrent, pBsonSource, and iterator, as needed. On exit,
+ pCurrent is the Document to return, or NULL. If NULL, this
+ indicates there is nothing more to return.
+ */
+ void getNextDocument();
+
+ bool newSource; // set to true for the first item of a new source
+ intrusive_ptr<DocumentSourceBsonArray> pBsonSource;
+ intrusive_ptr<Document> pCurrent;
+ FuturesList::iterator iterator;
+ FuturesList::iterator listEnd;
+ string &errmsg;
+ };
+
+
+ class DocumentSourceCursor :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceCursor();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Create a document source based on a cursor.
+
+ This is usually put at the beginning of a chain of document sources
+ in order to fetch data from the database.
+
+ @param pCursor the cursor to use to fetch data
+ */
+ static intrusive_ptr<DocumentSourceCursor> create(
+ const shared_ptr<Cursor> &pCursor);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceCursor(const shared_ptr<Cursor> &pTheCursor);
+
+ void findNext();
+ shared_ptr<Cursor> pCursor;
+ intrusive_ptr<Document> pCurrent;
+ };
+
+
+ /*
+ This contains all the basic mechanics for filtering a stream of
+ Documents, except for the actual predicate evaluation itself. This was
+ factored out so we could create DocumentSources that use both Matcher
+ style predicates as well as full Expressions.
+ */
+ class DocumentSourceFilterBase :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceFilterBase();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ virtual void toMatcherBson(BSONObjBuilder *pBuilder) const = 0;
+
+ protected:
+ DocumentSourceFilterBase();
+
+ /*
+ Test the given document against the predicate and report if it
+ should be accepted or not.
+
+ @param pDocument the document to test
+ @returns true if the document matches the filter, false otherwise
+ */
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const = 0;
+
+ private:
+
+ void findNext();
+
+ bool unstarted;
+ bool hasNext;
+ intrusive_ptr<Document> pCurrent;
+ };
+
+
+ class DocumentSourceFilter :
+ public DocumentSourceFilterBase {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceFilter();
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+ virtual void optimize();
+
+ /*
+ Create a filter.
+
+ @param pBsonElement the raw BSON specification for the filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a filter.
+
+ @param pFilter the expression to use to filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSourceFilter> create(
+ const intrusive_ptr<Expression> &pFilter);
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ void toMatcherBson(BSONObjBuilder *pBuilder) const;
+
+ static const char filterName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ // virtuals from DocumentSourceFilterBase
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const;
+
+ private:
+ DocumentSourceFilter(const intrusive_ptr<Expression> &pFilter);
+
+ intrusive_ptr<Expression> pFilter;
+ };
+
+
+ class DocumentSourceGroup :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceGroup();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new grouping DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceGroup> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Set the Id Expression.
+
+          Documents that pass through the grouping DocumentSource are grouped
+          according to this key.  This will generate the _id field in the
+ result documents.
+
+ @param pExpression the group key
+ */
+ void setIdExpression(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Add an accumulator.
+
+ Accumulators become fields in the Documents that result from
+          grouping.  Each unique group document must have its own
+ accumulator; the accumulator factory is used to create that.
+
+ @param fieldName the name the accumulator result will have in the
+ result documents
+ @param pAccumulatorFactory used to create the accumulator for the
+ group field
+ */
+ void addAccumulator(string fieldName,
+ intrusive_ptr<Accumulator> (*pAccumulatorFactory)(
+ const intrusive_ptr<ExpressionContext> &),
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Create a grouping DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $group.
+
+          @param pBsonElement the BSONElement that defines the group
+ @param pCtx the expression context
+ @returns the grouping DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ /*
+ Create a unifying group that can be used to combine group results
+ from shards.
+
+ @returns the grouping DocumentSource
+ */
+ intrusive_ptr<DocumentSource> createMerger();
+
+ static const char groupName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceGroup(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Before returning anything, this source must fetch everything from
+ the underlying source and group it. populate() is used to do that
+ on the first call to any method on this source. The populated
+ boolean indicates that this has been done.
+ */
+ void populate();
+ bool populated;
+
+ intrusive_ptr<Expression> pIdExpression;
+
+ typedef boost::unordered_map<intrusive_ptr<const Value>,
+ vector<intrusive_ptr<Accumulator> >, Value::Hash> GroupsType;
+ GroupsType groups;
+
+ /*
+ The field names for the result documents and the accumulator
+ factories for the result documents. The Expressions are the
+ common expressions used by each instance of each accumulator
+ in order to find the right-hand side of what gets added to the
+ accumulator. Note that each of those is the same for each group,
+ so we can share them across all groups by adding them to the
+ accumulators after we use the factories to make a new set of
+ accumulators for each new group.
+
+ These three vectors parallel each other.
+ */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<Accumulator> (*)(
+ const intrusive_ptr<ExpressionContext> &)> vpAccumulatorFactory;
+ vector<intrusive_ptr<Expression> > vpExpression;
+
+
+ intrusive_ptr<Document> makeDocument(
+ const GroupsType::iterator &rIter);
+
+ GroupsType::iterator groupsIterator;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class DocumentSourceMatch :
+ public DocumentSourceFilterBase {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceMatch();
+
+ /*
+ Create a filter.
+
+ @param pBsonElement the raw BSON specification for the filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ void toMatcherBson(BSONObjBuilder *pBuilder) const;
+
+ static const char matchName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ // virtuals from DocumentSourceFilterBase
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const;
+
+ private:
+ DocumentSourceMatch(const BSONObj &query);
+
+ Matcher matcher;
+ };
+
+
+ class DocumentSourceOut :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceOut();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a document source for output and pass-through.
+
+ This can be put anywhere in a pipeline and will store content as
+ well as pass it on.
+
+ @returns the newly created document source
+ */
+ static intrusive_ptr<DocumentSourceOut> createFromBson(
+ BSONElement *pBsonElement);
+
+ static const char outName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceOut(BSONElement *pBsonElement);
+ };
+
+
+ class DocumentSourceProject :
+ public DocumentSource,
+ public boost::enable_shared_from_this<DocumentSourceProject> {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceProject();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void optimize();
+
+ /*
+ Create a new DocumentSource that can implement projection.
+
+ @returns the projection DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceProject> create();
+
+ /*
+ Include a field path in a projection.
+
+ @param fieldPath the path of the field to include
+ */
+ void includePath(const string &fieldPath);
+
+ /*
+ Exclude a field path from the projection.
+
+ @param fieldPath the path of the field to exclude
+ */
+ void excludePath(const string &fieldPath);
+
+ /*
+ Add an output Expression in the projection.
+
+ BSON document fields are ordered, so the new field will be
+ appended to the existing set.
+
+ @param fieldName the name of the field as it will appear
+ @param pExpression the expression used to compute the field
+ */
+ void addField(const string &fieldName,
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Create a new projection DocumentSource from BSON.
+
+ This is a convenience for directly handling BSON, and relies on the
+ above methods.
+
+ @param pBsonElement the BSONElement with an object named $project
+ @returns the created projection
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ static const char projectName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceProject();
+
+ // configuration state
+ bool excludeId;
+ intrusive_ptr<ExpressionObject> pEO;
+ };
+
+
+ class DocumentSourceSort :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceSort();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ /*
+ TODO
+ Adjacent sorts should reduce to the last sort.
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+ */
+
+ /*
+ Create a new sorting DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceSort> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Add sort key field.
+
+ Adds a sort key field to the key being built up. A concatenated
+ key is built up by calling this repeatedly.
+
+ @param fieldPath the field path to the key component
+ @param ascending if true, use the key for an ascending sort,
+ otherwise, use it for descending
+ */
+ void addKey(const string &fieldPath, bool ascending);
+
+ /*
+ Write out an object whose contents are the sort key.
+
+ @param pBuilder initialized object builder.
+          @param usePrefix whether or not to include the field prefix
+ */
+ void sortKeyToBson(BSONObjBuilder *pBuilder, bool usePrefix) const;
+
+ /*
+ Create a sorting DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $sort.
+
+          @param pBsonElement the BSONElement that defines the sort
+          @param pCtx the expression context
+          @returns the sorting DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char sortName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceSort(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Before returning anything, this source must fetch everything from
+          the underlying source and sort it.  populate() is used to do that
+ on the first call to any method on this source. The populated
+ boolean indicates that this has been done.
+ */
+ void populate();
+ bool populated;
+ long long count;
+
+ /* these two parallel each other */
+ vector<intrusive_ptr<ExpressionFieldPath> > vSortKey;
+ vector<bool> vAscending;
+
+ class Carrier {
+ public:
+ /*
+ We need access to the key for compares, so we have to carry
+ this around.
+ */
+ DocumentSourceSort *pSort;
+
+ intrusive_ptr<Document> pDocument;
+
+ Carrier(DocumentSourceSort *pSort,
+ const intrusive_ptr<Document> &pDocument);
+
+ static bool lessThan(const Carrier &rL, const Carrier &rR);
+ };
+
+ /*
+ Compare two documents according to the specified sort key.
+
+ @param rL reference to the left document
+ @param rR reference to the right document
+ @returns a number less than, equal to, or greater than zero,
+ indicating pL < pR, pL == pR, or pL > pR, respectively
+ */
+ int compare(const intrusive_ptr<Document> &pL,
+ const intrusive_ptr<Document> &pR);
+
+ typedef list<Carrier> ListType;
+ ListType documents;
+
+ ListType::iterator listIterator;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class DocumentSourceLimit :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceLimit();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new limiting DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceLimit> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a limiting DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $limit.
+
+          @param pBsonElement the BSONElement that defines the limit
+          @param pCtx the expression context
+          @returns the limiting DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char limitName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceLimit(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ long long limit;
+ long long count;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+ class DocumentSourceSkip :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceSkip();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new skipping DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceSkip> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a skipping DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $skip.
+
+          @param pBsonElement the BSONElement that defines the skip
+          @param pCtx the expression context
+          @returns the skipping DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char skipName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceSkip(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Skips initial documents.
+ */
+ void skipper();
+
+ long long skip;
+ long long count;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class DocumentSourceUnwind :
+ public DocumentSource,
+ public boost::enable_shared_from_this<DocumentSourceUnwind> {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceUnwind();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new DocumentSource that can implement unwind.
+
+          @returns the unwind DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceUnwind> create();
+
+ /*
+ Specify the field to unwind. There must be exactly one before
+ the pipeline begins execution.
+
+ @param rFieldPath - path to the field to unwind
+ */
+ void unwindField(const FieldPath &rFieldPath);
+
+ /*
+          Create a new unwinding DocumentSource from BSON.
+
+          This is a convenience for directly handling BSON, and relies on the
+          above methods.
+
+          @param pBsonElement the BSONElement with an object named $unwind
+          @returns the created unwind source
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ static const char unwindName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceUnwind();
+
+ // configuration state
+ FieldPath unwindPath;
+
+ vector<int> fieldIndex; /* for the current document, the indices
+ leading down to the field being unwound */
+
+ // iteration state
+ intrusive_ptr<Document> pNoUnwindDocument;
+ // document to return, pre-unwind
+ intrusive_ptr<const Value> pUnwindArray; // field being unwound
+ intrusive_ptr<ValueIterator> pUnwinder; // iterator used for unwinding
+ intrusive_ptr<const Value> pUnwindValue; // current value
+
+ /*
+ Clear all the state related to unwinding an array.
+ */
+ void resetArray();
+
+ /*
+ Clone the current document being unwound.
+
+ This is a partial deep clone. Because we're going to replace the
+ value at the end, we have to replace everything along the path
+ leading to that in order to not share that change with any other
+ clones (or the original) that we've made.
+
+ This expects pUnwindValue to have been set by a prior call to
+ advance(). However, pUnwindValue may also be NULL, in which case
+ the field will be removed -- this is the action for an empty
+ array.
+
+ @returns a partial deep clone of pNoUnwindDocument
+ */
+ intrusive_ptr<Document> clonePath() const;
+
+ };
+
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline void DocumentSourceGroup::setIdExpression(
+ const intrusive_ptr<Expression> &pExpression) {
+ pIdExpression = pExpression;
+ }
+
+ inline void DocumentSourceUnwind::resetArray() {
+ pNoUnwindDocument.reset();
+ pUnwindArray.reset();
+ pUnwinder.reset();
+ pUnwindValue.reset();
+ }
+
+ inline DocumentSourceSort::Carrier::Carrier(
+ DocumentSourceSort *pTheSort,
+ const intrusive_ptr<Document> &pTheDocument):
+ pSort(pTheSort),
+ pDocument(pTheDocument) {
+ }
+}
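A minimal sketch of wiring sources into a pipeline using the interfaces declared
above: setSource() links a stage to its input, and coalesce() lets a stage absorb
its successor before linking.

    intrusive_ptr<DocumentSource> wireStages(
        const intrusive_ptr<DocumentSource> &pInput,
        const intrusive_ptr<DocumentSource> &pFirst,
        const intrusive_ptr<DocumentSource> &pSecond) {
        pFirst->setSource(pInput); // a source may only be set once

        // If pFirst absorbs pSecond, pSecond is discarded rather than
        // linked into the chain.
        if (pFirst->coalesce(pSecond))
            return pFirst;

        pSecond->setSource(pFirst);
        return pSecond; // consumers read from the tail of the chain
    }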
diff --git a/src/mongo/db/pipeline/document_source_bson_array.cpp b/src/mongo/db/pipeline/document_source_bson_array.cpp
new file mode 100755
index 00000000000..5d187b03ef9
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_bson_array.cpp
@@ -0,0 +1,83 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/pipeline/document.h"
+
+namespace mongo {
+
+ DocumentSourceBsonArray::~DocumentSourceBsonArray() {
+ }
+
+ bool DocumentSourceBsonArray::eof() {
+ return !haveCurrent;
+ }
+
+ bool DocumentSourceBsonArray::advance() {
+ if (eof())
+ return false;
+
+ if (!arrayIterator.more()) {
+ haveCurrent = false;
+ return false;
+ }
+
+ currentElement = arrayIterator.next();
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceBsonArray::getCurrent() {
+ assert(haveCurrent);
+ BSONObj documentObj(currentElement.Obj());
+ intrusive_ptr<Document> pDocument(
+ Document::createFromBsonObj(&documentObj));
+ return pDocument;
+ }
+
+ void DocumentSourceBsonArray::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ DocumentSourceBsonArray::DocumentSourceBsonArray(
+ BSONElement *pBsonElement):
+ embeddedObject(pBsonElement->embeddedObject()),
+ arrayIterator(embeddedObject),
+ haveCurrent(false) {
+ if (arrayIterator.more()) {
+ currentElement = arrayIterator.next();
+ haveCurrent = true;
+ }
+ }
+
+ intrusive_ptr<DocumentSourceBsonArray> DocumentSourceBsonArray::create(
+ BSONElement *pBsonElement) {
+
+ assert(pBsonElement->type() == Array);
+ intrusive_ptr<DocumentSourceBsonArray> pSource(
+ new DocumentSourceBsonArray(pBsonElement));
+
+ return pSource;
+ }
+
+ void DocumentSourceBsonArray::sourceToBson(BSONObjBuilder *pBuilder) const {
+ assert(false); // this has no analog in the BSON world
+ }
+}
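A sketch of constructing this source from a command object, assuming a
hypothetical spec of the form { docs: [ {...}, {...} ] } ("docs" is an invented
field name for illustration):

    intrusive_ptr<DocumentSourceBsonArray> sourceFromSpec(const BSONObj &spec) {
        BSONElement arrayElement(spec["docs"]); // hypothetical field
        assert(arrayElement.type() == Array);   // create() requires an array

        /* per the header's CAUTION: the array is read lazily, so spec
           must outlive the returned source */
        return DocumentSourceBsonArray::create(&arrayElement);
    }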
diff --git a/src/mongo/db/pipeline/document_source_command_futures.cpp b/src/mongo/db/pipeline/document_source_command_futures.cpp
new file mode 100755
index 00000000000..61a257cf16f
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_command_futures.cpp
@@ -0,0 +1,132 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+namespace mongo {
+
+ DocumentSourceCommandFutures::~DocumentSourceCommandFutures() {
+ }
+
+ bool DocumentSourceCommandFutures::eof() {
+ /* if we haven't even started yet, do so */
+ if (!pCurrent.get())
+ getNextDocument();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceCommandFutures::advance() {
+ if (eof())
+ return false;
+
+ /* advance */
+ getNextDocument();
+
+ return (pCurrent.get() != NULL);
+ }
+
+ intrusive_ptr<Document> DocumentSourceCommandFutures::getCurrent() {
+ assert(!eof());
+ return pCurrent;
+ }
+
+ void DocumentSourceCommandFutures::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ void DocumentSourceCommandFutures::sourceToBson(
+ BSONObjBuilder *pBuilder) const {
+ /* this has no BSON equivalent */
+ assert(false);
+ }
+
+ DocumentSourceCommandFutures::DocumentSourceCommandFutures(
+ string &theErrmsg, FuturesList *pList):
+ newSource(false),
+ pBsonSource(),
+ pCurrent(),
+ iterator(pList->begin()),
+ listEnd(pList->end()),
+ errmsg(theErrmsg) {
+ }
+
+ intrusive_ptr<DocumentSourceCommandFutures>
+ DocumentSourceCommandFutures::create(
+ string &errmsg, FuturesList *pList) {
+ intrusive_ptr<DocumentSourceCommandFutures> pSource(
+ new DocumentSourceCommandFutures(errmsg, pList));
+ return pSource;
+ }
+
+ void DocumentSourceCommandFutures::getNextDocument() {
+ while(true) {
+ if (!pBsonSource.get()) {
+ /* if there aren't any more futures, we're done */
+ if (iterator == listEnd) {
+ pCurrent.reset();
+ return;
+ }
+
+ /* grab the next command result */
+ shared_ptr<Future::CommandResult> pResult(*iterator);
+ ++iterator;
+
+ /* try to wait for it */
+ if (!pResult->join()) {
+ error() << "sharded pipeline failed on shard: " <<
+ pResult->getServer() << " error: " <<
+ pResult->result() << endl;
+ errmsg += "-- mongod pipeline failed: ";
+ errmsg += pResult->result().toString();
+
+ /* move on to the next command future */
+ continue;
+ }
+
+ /* grab the result array out of the shard server's response */
+ BSONObj shardResult(pResult->result());
+ BSONObjIterator objIterator(shardResult);
+ while(objIterator.more()) {
+ BSONElement element(objIterator.next());
+ const char *pFieldName = element.fieldName();
+
+ /* find the result array and quit this loop */
+ if (strcmp(pFieldName, "result") == 0) {
+ pBsonSource = DocumentSourceBsonArray::create(&element);
+ newSource = true;
+ break;
+ }
+ }
+ }
+
+ /* if we're done with this shard's results, try the next */
+ if (pBsonSource->eof() ||
+ (!newSource && !pBsonSource->advance())) {
+ pBsonSource.reset();
+ continue;
+ }
+
+ pCurrent = pBsonSource->getCurrent();
+ newSource = false;
+ return;
+ }
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_filter.cpp b/src/mongo/db/pipeline/document_source_filter.cpp
new file mode 100755
index 00000000000..66e57ba2e93
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_filter.cpp
@@ -0,0 +1,98 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceFilter::filterName[] = "$filter";
+
+ DocumentSourceFilter::~DocumentSourceFilter() {
+ }
+
+ bool DocumentSourceFilter::coalesce(
+ const intrusive_ptr<DocumentSource> &pNextSource) {
+
+ /* we only know how to coalesce other filters */
+ DocumentSourceFilter *pDocFilter =
+ dynamic_cast<DocumentSourceFilter *>(pNextSource.get());
+ if (!pDocFilter)
+ return false;
+
+ /*
+ Two adjacent filters can be combined by creating a conjunction of
+ their predicates.
+ */
+ intrusive_ptr<ExpressionNary> pAnd(ExpressionAnd::create());
+ pAnd->addOperand(pFilter);
+ pAnd->addOperand(pDocFilter->pFilter);
+ pFilter = pAnd;
+
+ return true;
+ }
+
+ void DocumentSourceFilter::optimize() {
+ pFilter = pFilter->optimize();
+ }
+
+ void DocumentSourceFilter::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pFilter->addToBsonObj(pBuilder, filterName, 0);
+ }
+
+ bool DocumentSourceFilter::accept(
+ const intrusive_ptr<Document> &pDocument) const {
+ intrusive_ptr<const Value> pValue(pFilter->evaluate(pDocument));
+ return pValue->coerceToBool();
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceFilter::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15946, "a document filter expression must be an object",
+ pBsonElement->type() == Object);
+
+ Expression::ObjectCtx oCtx(0);
+ intrusive_ptr<Expression> pExpression(
+ Expression::parseObject(pBsonElement, &oCtx));
+ intrusive_ptr<DocumentSourceFilter> pFilter(
+ DocumentSourceFilter::create(pExpression));
+
+ return pFilter;
+ }
+
+ intrusive_ptr<DocumentSourceFilter> DocumentSourceFilter::create(
+ const intrusive_ptr<Expression> &pFilter) {
+ intrusive_ptr<DocumentSourceFilter> pSource(
+ new DocumentSourceFilter(pFilter));
+ return pSource;
+ }
+
+ DocumentSourceFilter::DocumentSourceFilter(
+ const intrusive_ptr<Expression> &pTheFilter):
+ DocumentSourceFilterBase(),
+ pFilter(pTheFilter) {
+ }
+
+ void DocumentSourceFilter::toMatcherBson(BSONObjBuilder *pBuilder) const {
+ pFilter->toMatcherBson(pBuilder, 0);
+ }
+}
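A sketch of the coalescence above: two adjacent filters collapse into a single
stage whose predicate is the conjunction of both, assuming two already-parsed
filter expressions:

    intrusive_ptr<DocumentSource> combineFilters(
        const intrusive_ptr<Expression> &pFirst,
        const intrusive_ptr<Expression> &pSecond) {
        intrusive_ptr<DocumentSourceFilter> pA(
            DocumentSourceFilter::create(pFirst));
        intrusive_ptr<DocumentSourceFilter> pB(
            DocumentSourceFilter::create(pSecond));

        bool combined = pA->coalesce(pB); // pA now tests (pFirst AND pSecond)
        assert(combined);                 // filters always combine
        return pA;                        // pB can be discarded
    }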
diff --git a/src/mongo/db/pipeline/document_source_filter_base.cpp b/src/mongo/db/pipeline/document_source_filter_base.cpp
new file mode 100755
index 00000000000..dbda34b7151
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_filter_base.cpp
@@ -0,0 +1,85 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ DocumentSourceFilterBase::~DocumentSourceFilterBase() {
+ }
+
+ void DocumentSourceFilterBase::findNext() {
+ /* only do this the first time */
+ if (unstarted) {
+ hasNext = !pSource->eof();
+ unstarted = false;
+ }
+
+ while(hasNext) {
+ boost::intrusive_ptr<Document> pDocument(pSource->getCurrent());
+ hasNext = pSource->advance();
+
+ if (accept(pDocument)) {
+ pCurrent = pDocument;
+ return;
+ }
+ }
+
+ pCurrent.reset();
+ }
+
+ bool DocumentSourceFilterBase::eof() {
+ if (unstarted)
+ findNext();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceFilterBase::advance() {
+ if (unstarted)
+ findNext();
+
+ /*
+ This looks weird after the above, but is correct. Note that calling
+ getCurrent() when first starting already yields the first document
+ in the collection. Calling advance() without using getCurrent()
+ first will skip over the first item.
+ */
+ findNext();
+
+ return (pCurrent.get() != NULL);
+ }
+
+ boost::intrusive_ptr<Document> DocumentSourceFilterBase::getCurrent() {
+ if (unstarted)
+ findNext();
+
+ assert(pCurrent.get() != NULL);
+ return pCurrent;
+ }
+
+ DocumentSourceFilterBase::DocumentSourceFilterBase():
+ unstarted(true),
+ hasNext(false),
+ pCurrent() {
+ }
+}
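
The comment in advance() above describes the cursor contract every stage
in this patch follows: getCurrent() already yields the first document,
and advance() moves past it, so a consumer must read before advancing.
A self-contained sketch of the same convention over a plain vector
(hypothetical names, nothing from mongo's headers):

    #include <cassert>
    #include <iostream>
    #include <vector>

    // hypothetical cursor with the same contract as DocumentSource
    class VecCursor {
        const std::vector<int> &v;
        size_t i;
    public:
        explicit VecCursor(const std::vector<int> &src) : v(src), i(0) {}
        bool eof() const { return i >= v.size(); }
        int getCurrent() const { assert(!eof()); return v[i]; }
        bool advance() { ++i; return !eof(); }
    };

    int main() {
        std::vector<int> data;
        data.push_back(1); data.push_back(2); data.push_back(3);
        VecCursor c(data);
        // the same drain idiom used by findNext() and populate()
        for (bool hasNext = !c.eof(); hasNext; hasNext = c.advance())
            std::cout << c.getCurrent() << "\n"; // read before advancing
        return 0;
    }
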
diff --git a/src/mongo/db/pipeline/document_source_group.cpp b/src/mongo/db/pipeline/document_source_group.cpp
new file mode 100755
index 00000000000..244561589da
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_group.cpp
@@ -0,0 +1,391 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceGroup::groupName[] = "$group";
+
+ DocumentSourceGroup::~DocumentSourceGroup() {
+ }
+
+ bool DocumentSourceGroup::eof() {
+ if (!populated)
+ populate();
+
+ return (groupsIterator == groups.end());
+ }
+
+ bool DocumentSourceGroup::advance() {
+ if (!populated)
+ populate();
+
+ assert(groupsIterator != groups.end());
+
+ ++groupsIterator;
+ if (groupsIterator == groups.end()) {
+ pCurrent.reset();
+ return false;
+ }
+
+ pCurrent = makeDocument(groupsIterator);
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceGroup::getCurrent() {
+ if (!populated)
+ populate();
+
+ return pCurrent;
+ }
+
+ void DocumentSourceGroup::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+
+ /* add the _id */
+ pIdExpression->addToBsonObj(&insides, Document::idName.c_str(), 0);
+
+ /* add the remaining fields */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Accumulator> pA((*vpAccumulatorFactory[i])(pCtx));
+ pA->addOperand(vpExpression[i]);
+ pA->addToBsonObj(&insides, vFieldName[i], 0);
+ }
+
+ pBuilder->append(groupName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceGroup> DocumentSourceGroup::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceGroup> pSource(
+ new DocumentSourceGroup(pCtx));
+ return pSource;
+ }
+
+ DocumentSourceGroup::DocumentSourceGroup(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ populated(false),
+ pIdExpression(),
+ groups(),
+ vFieldName(),
+ vpAccumulatorFactory(),
+ vpExpression(),
+ pCtx(pTheCtx) {
+ }
+
+ void DocumentSourceGroup::addAccumulator(
+ string fieldName,
+ intrusive_ptr<Accumulator> (*pAccumulatorFactory)(
+ const intrusive_ptr<ExpressionContext> &),
+ const intrusive_ptr<Expression> &pExpression) {
+ vFieldName.push_back(fieldName);
+ vpAccumulatorFactory.push_back(pAccumulatorFactory);
+ vpExpression.push_back(pExpression);
+ }
+
+
+ struct GroupOpDesc {
+ const char *pName;
+ intrusive_ptr<Accumulator> (*pFactory)(
+ const intrusive_ptr<ExpressionContext> &);
+ };
+
+ static int GroupOpDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const GroupOpDesc *)pL)->pName,
+ ((const GroupOpDesc *)pR)->pName);
+ }
+
+ /*
+ Keep these sorted alphabetically so we can bsearch() them using
+ GroupOpDescCmp() above.
+ */
+ static const GroupOpDesc GroupOpTable[] = {
+ {"$addToSet", AccumulatorAddToSet::create},
+ {"$avg", AccumulatorAvg::create},
+ {"$first", AccumulatorFirst::create},
+ {"$last", AccumulatorLast::create},
+ {"$max", AccumulatorMinMax::createMax},
+ {"$min", AccumulatorMinMax::createMin},
+ {"$push", AccumulatorPush::create},
+ {"$sum", AccumulatorSum::create},
+ };
+
+ static const size_t NGroupOp = sizeof(GroupOpTable)/sizeof(GroupOpTable[0]);
+
+ intrusive_ptr<DocumentSource> DocumentSourceGroup::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15947, "a group's fields must be specified in an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceGroup> pGroup(
+ DocumentSourceGroup::create(pCtx));
+ bool idSet = false;
+
+ BSONObj groupObj(pBsonElement->Obj());
+ BSONObjIterator groupIterator(groupObj);
+ while(groupIterator.more()) {
+ BSONElement groupField(groupIterator.next());
+ const char *pFieldName = groupField.fieldName();
+
+ if (strcmp(pFieldName, Document::idName.c_str()) == 0) {
+ uassert(15948, "a group's _id may only be specified once",
+ !idSet);
+
+ BSONType groupType = groupField.type();
+
+ if (groupType == Object) {
+ /*
+ Use the projection-like set of field paths to create the
+ group-by key.
+ */
+ Expression::ObjectCtx oCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pId(
+ Expression::parseObject(&groupField, &oCtx));
+
+ pGroup->setIdExpression(pId);
+ idSet = true;
+ }
+ else if (groupType == String) {
+ string groupString(groupField.String());
+ const char *pGroupString = groupString.c_str();
+ if ((groupString.length() == 0) ||
+ (pGroupString[0] != '$'))
+ goto StringConstantId;
+
+ string pathString(
+ Expression::removeFieldPrefix(groupString));
+ intrusive_ptr<ExpressionFieldPath> pFieldPath(
+ ExpressionFieldPath::create(pathString));
+ pGroup->setIdExpression(pFieldPath);
+ idSet = true;
+ }
+ else {
+ /* pick out the constant types that are allowed */
+ switch(groupType) {
+ case NumberDouble:
+ case String:
+ case Object:
+ case Array:
+ case jstOID:
+ case Bool:
+ case Date:
+ case NumberInt:
+ case Timestamp:
+ case NumberLong:
+ case jstNULL:
+ StringConstantId: // from string case above
+ {
+ intrusive_ptr<const Value> pValue(
+ Value::createFromBsonElement(&groupField));
+ intrusive_ptr<ExpressionConstant> pConstant(
+ ExpressionConstant::create(pValue));
+ pGroup->setIdExpression(pConstant);
+ idSet = true;
+ break;
+ }
+
+ default:
+ uassert(15949, str::stream() <<
+ "a group's _id may not include fields of BSON type " << groupType,
+ false);
+ }
+ }
+ }
+ else {
+ /*
+ Treat as a projection field with the additional ability to
+ add aggregation operators.
+ */
+                uassert(15950, str::stream() <<
+                        "the group aggregate field name \"" <<
+                        pFieldName << "\" cannot be an operator name",
+                        *pFieldName != '$');
+
+                uassert(15951, str::stream() <<
+                        "the group aggregate field \"" << pFieldName <<
+                        "\" must be defined as an expression inside an object",
+                        groupField.type() == Object);
+
+ BSONObj subField(groupField.Obj());
+ BSONObjIterator subIterator(subField);
+ size_t subCount = 0;
+ for(; subIterator.more(); ++subCount) {
+ BSONElement subElement(subIterator.next());
+
+ /* look for the specified operator */
+ GroupOpDesc key;
+ key.pName = subElement.fieldName();
+ const GroupOpDesc *pOp =
+ (const GroupOpDesc *)bsearch(
+ &key, GroupOpTable, NGroupOp, sizeof(GroupOpDesc),
+ GroupOpDescCmp);
+
+ uassert(15952, str::stream() <<
+ "unknown group operator \"" <<
+ key.pName << "\"",
+ pOp);
+
+ intrusive_ptr<Expression> pGroupExpr;
+
+ BSONType elementType = subElement.type();
+ if (elementType == Object) {
+ Expression::ObjectCtx oCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ pGroupExpr = Expression::parseObject(
+ &subElement, &oCtx);
+ }
+ else if (elementType == Array) {
+ uassert(15953, str::stream() <<
+ "aggregating group operators are unary (" <<
+ key.pName << ")", false);
+ }
+                else { /* assume it's an atomic single operand */
+ pGroupExpr = Expression::parseOperand(&subElement);
+ }
+
+ pGroup->addAccumulator(
+ pFieldName, pOp->pFactory, pGroupExpr);
+ }
+
+ uassert(15954, str::stream() <<
+ "the computed aggregate \"" <<
+ pFieldName << "\" must specify exactly one operator",
+ subCount == 1);
+ }
+ }
+
+ uassert(15955, "a group specification must include an _id", idSet);
+
+ return pGroup;
+ }
+
+ void DocumentSourceGroup::populate() {
+ for(bool hasNext = !pSource->eof(); hasNext;
+ hasNext = pSource->advance()) {
+ intrusive_ptr<Document> pDocument(pSource->getCurrent());
+
+ /* get the _id document */
+ intrusive_ptr<const Value> pId(pIdExpression->evaluate(pDocument));
+ uassert(15956, "the _id field for a group must not be undefined",
+ pId->getType() != Undefined);
+
+ /*
+ Look for the _id value in the map; if it's not there, add a
+ new entry with a blank accumulator.
+ */
+ vector<intrusive_ptr<Accumulator> > *pGroup;
+ GroupsType::iterator it(groups.find(pId));
+ if (it != groups.end()) {
+ /* point at the existing accumulators */
+ pGroup = &it->second;
+ }
+ else {
+ /* insert a new group into the map */
+ groups.insert(it,
+ pair<intrusive_ptr<const Value>,
+ vector<intrusive_ptr<Accumulator> > >(
+ pId, vector<intrusive_ptr<Accumulator> >()));
+
+ /* find the accumulator vector (the map value) */
+ it = groups.find(pId);
+ pGroup = &it->second;
+
+ /* add the accumulators */
+ const size_t n = vpAccumulatorFactory.size();
+ pGroup->reserve(n);
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Accumulator> pAccumulator(
+ (*vpAccumulatorFactory[i])(pCtx));
+ pAccumulator->addOperand(vpExpression[i]);
+ pGroup->push_back(pAccumulator);
+ }
+ }
+
+ /* point at the existing key */
+ // unneeded atm // pId = it.first;
+
+ /* tickle all the accumulators for the group we found */
+ const size_t n = pGroup->size();
+ for(size_t i = 0; i < n; ++i)
+ (*pGroup)[i]->evaluate(pDocument);
+ }
+
+ /* start the group iterator */
+ groupsIterator = groups.begin();
+ if (groupsIterator != groups.end())
+ pCurrent = makeDocument(groupsIterator);
+ populated = true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceGroup::makeDocument(
+ const GroupsType::iterator &rIter) {
+ vector<intrusive_ptr<Accumulator> > *pGroup = &rIter->second;
+ const size_t n = vFieldName.size();
+ intrusive_ptr<Document> pResult(Document::create(1 + n));
+
+ /* add the _id field */
+ pResult->addField(Document::idName, rIter->first);
+
+ /* add the rest of the fields */
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue((*pGroup)[i]->getValue());
+ if (pValue->getType() != Undefined)
+ pResult->addField(vFieldName[i], pValue);
+ }
+
+ return pResult;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceGroup::createMerger() {
+ intrusive_ptr<DocumentSourceGroup> pMerger(
+ DocumentSourceGroup::create(pCtx));
+
+ /* the merger will use the same grouping key */
+ pMerger->setIdExpression(ExpressionFieldPath::create(
+ Document::idName.c_str()));
+
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ /*
+ The merger's output field names will be the same, as will the
+ accumulator factories. However, for some accumulators, the
+ expression to be accumulated will be different. The original
+ accumulator may be collecting an expression based on a field
+ expression or constant. Here, we accumulate the output of the
+ same name from the prior group.
+ */
+ pMerger->addAccumulator(
+ vFieldName[i], vpAccumulatorFactory[i],
+ ExpressionFieldPath::create(vFieldName[i]));
+ }
+
+ return pMerger;
+ }
+}
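
GroupOpTable above (like OpTable in expression.cpp further down) depends
on its entries staying alphabetically sorted so that a plain bsearch()
can resolve operator names. A self-contained sketch of that lookup over
a hypothetical table:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    struct OpDesc { const char *pName; int id; };

    static int OpDescCmp(const void *pL, const void *pR) {
        return std::strcmp(static_cast<const OpDesc *>(pL)->pName,
                           static_cast<const OpDesc *>(pR)->pName);
    }

    // must stay sorted by pName for bsearch() to work
    static const OpDesc table[] = {
        {"$avg", 1}, {"$max", 2}, {"$min", 3}, {"$sum", 4},
    };

    int main() {
        OpDesc key = {"$min", 0};
        const OpDesc *pOp = static_cast<const OpDesc *>(
            std::bsearch(&key, table, sizeof(table)/sizeof(table[0]),
                         sizeof(OpDesc), OpDescCmp));
        if (pOp)
            std::printf("%s -> factory id %d\n", pOp->pName, pOp->id);
        return 0;
    }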
+
+
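
populate() above makes a single streaming pass: it evaluates the _id key
for each input document, finds or creates that group's accumulator
vector, and "tickles" every accumulator with the document. The same
shape with a std::map and a running $sum, as a hypothetical sketch:

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>

    int main() {
        // (key, value) rows standing in for documents and their _id
        std::pair<std::string, int> rows[] = {
            std::make_pair("a", 1), std::make_pair("b", 2),
            std::make_pair("a", 3),
        };
        std::map<std::string, long long> sums; // one accumulator per group
        for (size_t i = 0; i < sizeof(rows)/sizeof(rows[0]); ++i)
            sums[rows[i].first] += rows[i].second; // tickle the accumulator
        // walk the finished groups, as advance()/getCurrent() do
        for (std::map<std::string, long long>::const_iterator it =
                 sums.begin(); it != sums.end(); ++it)
            std::cout << it->first << " -> " << it->second << "\n";
        return 0;
    }
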
diff --git a/src/mongo/db/pipeline/document_source_limit.cpp b/src/mongo/db/pipeline/document_source_limit.cpp
new file mode 100644
index 00000000000..a73d4da2005
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_limit.cpp
@@ -0,0 +1,83 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceLimit::limitName[] = "$limit";
+
+ DocumentSourceLimit::DocumentSourceLimit(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ limit(0),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ DocumentSourceLimit::~DocumentSourceLimit() {
+ }
+
+ bool DocumentSourceLimit::eof() {
+ return pSource->eof() || count >= limit;
+ }
+
+ bool DocumentSourceLimit::advance() {
+ ++count;
+ if (count >= limit) {
+ pCurrent.reset();
+ return false;
+ }
+ pCurrent = pSource->getCurrent();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceLimit::getCurrent() {
+ return pSource->getCurrent();
+ }
+
+ void DocumentSourceLimit::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append("$limit", limit);
+ }
+
+ intrusive_ptr<DocumentSourceLimit> DocumentSourceLimit::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceLimit> pSource(
+ new DocumentSourceLimit(pCtx));
+ return pSource;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceLimit::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15957, "the limit must be specified as a number",
+ pBsonElement->isNumber());
+
+ intrusive_ptr<DocumentSourceLimit> pLimit(
+ DocumentSourceLimit::create(pCtx));
+
+ pLimit->limit = (int)pBsonElement->numberLong();
+ uassert(15958, "the limit must be positive",
+ pLimit->limit > 0);
+
+ return pLimit;
+ }
+}
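
The $limit stage is a counting pass-through: documents flow by until
count reaches limit, after which eof() reports true regardless of what
the source still holds. The semantics in a standalone sketch, assuming a
plain array as the source:

    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> source;
        for (int i = 1; i <= 5; ++i) source.push_back(i * 10);

        const int limit = 3;
        int count = 0;
        // forward at most `limit` documents, as eof()/advance() do above
        for (size_t i = 0; i < source.size() && count < limit; ++i, ++count)
            std::cout << source[i] << "\n"; // prints 10, 20, 30
        return 0;
    }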
diff --git a/src/mongo/db/pipeline/document_source_match.cpp b/src/mongo/db/pipeline/document_source_match.cpp
new file mode 100755
index 00000000000..bedac3ef717
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_match.cpp
@@ -0,0 +1,80 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/matcher.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+
+namespace mongo {
+
+ const char DocumentSourceMatch::matchName[] = "$match";
+
+ DocumentSourceMatch::~DocumentSourceMatch() {
+ }
+
+ void DocumentSourceMatch::sourceToBson(BSONObjBuilder *pBuilder) const {
+ const BSONObj *pQuery = matcher.getQuery();
+ pBuilder->append(matchName, *pQuery);
+ }
+
+ bool DocumentSourceMatch::accept(
+ const intrusive_ptr<Document> &pDocument) const {
+
+ /*
+ The matcher only takes BSON documents, so we have to make one.
+
+ LATER
+          We could optimize this by making a document with only the
+          fields referenced by the Matcher. We could do this by looking
+          inside the Matcher's BSON query and recording the fields it
+          references. The easiest implementation might be to hold onto an
+          ExpressionDocument in here, use it to create the subset of
+          pDocument's fields, and then convert that instead.
+ */
+ BSONObjBuilder objBuilder;
+ pDocument->toBson(&objBuilder);
+ BSONObj obj(objBuilder.done());
+
+ return matcher.matches(obj);
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceMatch::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15959, "the match filter must be an expression in an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceMatch> pMatcher(
+ new DocumentSourceMatch(pBsonElement->Obj()));
+
+ return pMatcher;
+ }
+
+ void DocumentSourceMatch::toMatcherBson(BSONObjBuilder *pBuilder) const {
+ const BSONObj *pQuery = matcher.getQuery();
+ pBuilder->appendElements(*pQuery);
+ }
+
+ DocumentSourceMatch::DocumentSourceMatch(const BSONObj &query):
+ DocumentSourceFilterBase(),
+ matcher(query) {
+ }
+}
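
accept() above has to round-trip each pipeline document back into BSON
because the Matcher only understands BSON; the LATER note sketches how
to shrink that cost. The essence of the round-trip-then-match step, in a
hypothetical sketch where the "BSON" is a map and the "matcher" checks
simple equality:

    #include <iostream>
    #include <map>
    #include <string>

    typedef std::map<std::string, int> Bson; // stand-in for a BSON object

    // a degenerate "matcher": every query field must match exactly
    static bool matches(const Bson &query, const Bson &doc) {
        for (Bson::const_iterator it = query.begin();
             it != query.end(); ++it) {
            Bson::const_iterator f = doc.find(it->first);
            if (f == doc.end() || f->second != it->second)
                return false;
        }
        return true;
    }

    int main() {
        Bson query; query["a"] = 7;
        Bson doc;   doc["a"] = 7; doc["b"] = 1; // the converted document
        std::cout << (matches(query, doc) ? "accept" : "reject") << "\n";
        return 0;
    }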
diff --git a/src/mongo/db/pipeline/document_source_out.cpp b/src/mongo/db/pipeline/document_source_out.cpp
new file mode 100755
index 00000000000..5a30342d25c
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_out.cpp
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+
+namespace mongo {
+
+ const char DocumentSourceOut::outName[] = "$out";
+
+ DocumentSourceOut::~DocumentSourceOut() {
+ }
+
+ bool DocumentSourceOut::eof() {
+ return pSource->eof();
+ }
+
+ bool DocumentSourceOut::advance() {
+ return pSource->advance();
+ }
+
+ boost::intrusive_ptr<Document> DocumentSourceOut::getCurrent() {
+ return pSource->getCurrent();
+ }
+
+ DocumentSourceOut::DocumentSourceOut(BSONElement *pBsonElement) {
+ assert(false && "unimplemented");
+ }
+
+ intrusive_ptr<DocumentSourceOut> DocumentSourceOut::createFromBson(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<DocumentSourceOut> pSource(
+ new DocumentSourceOut(pBsonElement));
+
+ return pSource;
+ }
+
+ void DocumentSourceOut::sourceToBson(BSONObjBuilder *pBuilder) const {
+ assert(false); // CW TODO
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_project.cpp b/src/mongo/db/pipeline/document_source_project.cpp
new file mode 100755
index 00000000000..bb7a0b5a6d9
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_project.cpp
@@ -0,0 +1,201 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceProject::projectName[] = "$project";
+
+ DocumentSourceProject::~DocumentSourceProject() {
+ }
+
+ DocumentSourceProject::DocumentSourceProject():
+ excludeId(false),
+ pEO(ExpressionObject::create()) {
+ }
+
+ bool DocumentSourceProject::eof() {
+ return pSource->eof();
+ }
+
+ bool DocumentSourceProject::advance() {
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceProject::getCurrent() {
+ intrusive_ptr<Document> pInDocument(pSource->getCurrent());
+
+ /* create the result document */
+ const size_t sizeHint =
+ pEO->getSizeHint(pInDocument) + (excludeId ? 0 : 1);
+ intrusive_ptr<Document> pResultDocument(Document::create(sizeHint));
+
+ if (!excludeId) {
+ intrusive_ptr<const Value> pId(
+ pInDocument->getField(Document::idName));
+ pResultDocument->addField(Document::idName, pId);
+ }
+
+ /* use the ExpressionObject to create the base result */
+ pEO->addToDocument(pResultDocument, pInDocument);
+
+ return pResultDocument;
+ }
+
+ void DocumentSourceProject::optimize() {
+ intrusive_ptr<Expression> pE(pEO->optimize());
+ pEO = dynamic_pointer_cast<ExpressionObject>(pE);
+ }
+
+ void DocumentSourceProject::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ if (excludeId)
+ insides.append(Document::idName, false);
+ pEO->documentToBson(&insides, 0);
+ pBuilder->append(projectName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceProject> DocumentSourceProject::create() {
+ intrusive_ptr<DocumentSourceProject> pSource(
+ new DocumentSourceProject());
+ return pSource;
+ }
+
+ void DocumentSourceProject::addField(
+ const string &fieldName, const intrusive_ptr<Expression> &pExpression) {
+ uassert(15960,
+ "projection fields must be defined by non-empty expressions",
+ pExpression);
+
+ pEO->addField(fieldName, pExpression);
+ }
+
+ void DocumentSourceProject::includePath(const string &fieldPath) {
+ if (Document::idName.compare(fieldPath) == 0) {
+ uassert(15961, str::stream() << projectName <<
+ ": _id cannot be included once it has been excluded",
+ !excludeId);
+
+ return;
+ }
+
+ pEO->includePath(fieldPath);
+ }
+
+ void DocumentSourceProject::excludePath(const string &fieldPath) {
+ if (Document::idName.compare(fieldPath) == 0) {
+ excludeId = true;
+ return;
+ }
+
+ pEO->excludePath(fieldPath);
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceProject::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ /* validate */
+ uassert(15969, str::stream() << projectName <<
+ " specification must be an object",
+ pBsonElement->type() == Object);
+
+ /* chain the projection onto the original source */
+ intrusive_ptr<DocumentSourceProject> pProject(
+ DocumentSourceProject::create());
+
+ /*
+ Pull out the $project object. This should just be a list of
+ field inclusion or exclusion specifications. Note you can't do
+ both, except for the case of _id.
+ */
+ BSONObj projectObj(pBsonElement->Obj());
+ BSONObjIterator fieldIterator(projectObj);
+ Expression::ObjectCtx objectCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ while(fieldIterator.more()) {
+ BSONElement outFieldElement(fieldIterator.next());
+ string outFieldPath(outFieldElement.fieldName());
+ string inFieldName(outFieldPath);
+ BSONType specType = outFieldElement.type();
+ int fieldInclusion = -1;
+
+ switch(specType) {
+ case NumberDouble: {
+ double inclusion = outFieldElement.numberDouble();
+ fieldInclusion = static_cast<int>(inclusion);
+ goto IncludeExclude;
+ }
+
+ case NumberInt:
+ /* just a plain integer include/exclude specification */
+ fieldInclusion = outFieldElement.numberInt();
+
+IncludeExclude:
+ uassert(15970, str::stream() <<
+ "field inclusion or exclusion specification for \"" <<
+ outFieldPath <<
+ "\" must be true, 1, false, or zero",
+ ((fieldInclusion == 0) || (fieldInclusion == 1)));
+
+ if (fieldInclusion == 0)
+ pProject->excludePath(outFieldPath);
+ else
+ pProject->includePath(outFieldPath);
+ break;
+
+ case Bool:
+ /* just a plain boolean include/exclude specification */
+ fieldInclusion = (outFieldElement.Bool() ? 1 : 0);
+ goto IncludeExclude;
+
+ case String:
+ /* include a field, with rename */
+ fieldInclusion = 1;
+ inFieldName = outFieldElement.String();
+ pProject->addField(
+ outFieldPath,
+ ExpressionFieldPath::create(
+ Expression::removeFieldPrefix(inFieldName)));
+ break;
+
+ case Object: {
+ intrusive_ptr<Expression> pDocument(
+ Expression::parseObject(&outFieldElement, &objectCtx));
+
+            /* add the document expression to the projection */
+ pProject->addField(outFieldPath, pDocument);
+ break;
+ }
+
+ default:
+ uassert(15971, str::stream() <<
+ "invalid BSON type (" << specType <<
+ ") for " << projectName <<
+ " field " << outFieldPath, false);
+ }
+
+ }
+
+ return pProject;
+ }
+}
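
createFromBson() above funnels the NumberDouble, NumberInt, and Bool
spec types into a single include/exclude check via the IncludeExclude
label. A goto-free sketch of the same normalization, under the
assumption that every numeric or boolean spec can be widened to double
first (hypothetical helper, not mongo's API):

    #include <iostream>
    #include <stdexcept>

    // normalize a projection spec (1/0, true/false, 1.0/0.0) to a flag
    static bool includeField(double spec) {
        const int inclusion = static_cast<int>(spec);
        if (inclusion != 0 && inclusion != 1)
            throw std::runtime_error(
                "field inclusion or exclusion must be true, 1, false, or 0");
        return inclusion == 1;
    }

    int main() {
        std::cout << includeField(1.0) << " "    // 1: include
                  << includeField(false) << "\n"; // 0: exclude
        return 0;
    }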
diff --git a/src/mongo/db/pipeline/document_source_skip.cpp b/src/mongo/db/pipeline/document_source_skip.cpp
new file mode 100644
index 00000000000..74bf2360ce9
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_skip.cpp
@@ -0,0 +1,99 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceSkip::skipName[] = "$skip";
+
+ DocumentSourceSkip::DocumentSourceSkip(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ skip(0),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ DocumentSourceSkip::~DocumentSourceSkip() {
+ }
+
+ void DocumentSourceSkip::skipper() {
+ if (count == 0) {
+ while (!pSource->eof() && count++ < skip) {
+ pSource->advance();
+ }
+ }
+
+ if (pSource->eof()) {
+ pCurrent.reset();
+ return;
+ }
+
+ pCurrent = pSource->getCurrent();
+ }
+
+ bool DocumentSourceSkip::eof() {
+ skipper();
+ return pSource->eof();
+ }
+
+ bool DocumentSourceSkip::advance() {
+ if (eof()) {
+ pCurrent.reset();
+ return false;
+ }
+
+ pCurrent = pSource->getCurrent();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceSkip::getCurrent() {
+ skipper();
+ return pCurrent;
+ }
+
+ void DocumentSourceSkip::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append("$skip", skip);
+ }
+
+ intrusive_ptr<DocumentSourceSkip> DocumentSourceSkip::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceSkip> pSource(
+ new DocumentSourceSkip(pCtx));
+ return pSource;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceSkip::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15972, str::stream() << "the value to " <<
+ skipName << " must be a number", pBsonElement->isNumber());
+
+ intrusive_ptr<DocumentSourceSkip> pSkip(
+ DocumentSourceSkip::create(pCtx));
+
+ pSkip->skip = (int)pBsonElement->numberLong();
+ assert(pSkip->skip > 0); // CW TODO error code
+
+ return pSkip;
+ }
+}
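
skipper() above discards the first `skip` documents exactly once (the
count == 0 guard), after which the stage is a pure pass-through. The
counting idiom on its own, as a standalone sketch:

    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> source;
        for (int i = 1; i <= 5; ++i) source.push_back(i);

        const int skip = 2;
        int count = 0;
        size_t pos = 0;
        // discard documents while count++ < skip, as skipper() does
        while (pos < source.size() && count++ < skip)
            ++pos;
        for (; pos < source.size(); ++pos)
            std::cout << source[pos] << "\n"; // prints 3, 4, 5
        return 0;
    }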
diff --git a/src/mongo/db/pipeline/document_source_sort.cpp b/src/mongo/db/pipeline/document_source_sort.cpp
new file mode 100755
index 00000000000..bf4739af7d1
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_sort.cpp
@@ -0,0 +1,216 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/doc_mem_monitor.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+
+namespace mongo {
+ const char DocumentSourceSort::sortName[] = "$sort";
+
+ DocumentSourceSort::~DocumentSourceSort() {
+ }
+
+ bool DocumentSourceSort::eof() {
+ if (!populated)
+ populate();
+
+ return (listIterator == documents.end());
+ }
+
+ bool DocumentSourceSort::advance() {
+ if (!populated)
+ populate();
+
+ assert(listIterator != documents.end());
+
+ ++listIterator;
+ if (listIterator == documents.end()) {
+ pCurrent.reset();
+ count = 0;
+ return false;
+ }
+ pCurrent = listIterator->pDocument;
+
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceSort::getCurrent() {
+ if (!populated)
+ populate();
+
+ return pCurrent;
+ }
+
+ void DocumentSourceSort::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ sortKeyToBson(&insides, false);
+ pBuilder->append(sortName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceSort> DocumentSourceSort::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceSort> pSource(
+ new DocumentSourceSort(pCtx));
+ return pSource;
+ }
+
+ DocumentSourceSort::DocumentSourceSort(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ populated(false),
+ pCtx(pTheCtx) {
+ }
+
+ void DocumentSourceSort::addKey(const string &fieldPath, bool ascending) {
+ intrusive_ptr<ExpressionFieldPath> pE(
+ ExpressionFieldPath::create(fieldPath));
+ vSortKey.push_back(pE);
+ vAscending.push_back(ascending);
+ }
+
+ void DocumentSourceSort::sortKeyToBson(
+ BSONObjBuilder *pBuilder, bool usePrefix) const {
+ /* add the key fields */
+ const size_t n = vSortKey.size();
+ for(size_t i = 0; i < n; ++i) {
+ /* create the "field name" */
+ stringstream ss;
+ vSortKey[i]->writeFieldPath(ss, usePrefix);
+
+ /* append a named integer based on the sort order */
+ pBuilder->append(ss.str(), (vAscending[i] ? 1 : -1));
+ }
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceSort::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+        uassert(15973, str::stream() << "the " <<
+ sortName << " key specification must be an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceSort> pSort(
+ DocumentSourceSort::create(pCtx));
+
+        /* check for, then iterate over, the sort object */
+ size_t sortKeys = 0;
+ for(BSONObjIterator keyIterator(pBsonElement->Obj().begin());
+ keyIterator.more();) {
+ BSONElement keyField(keyIterator.next());
+ const char *pKeyFieldName = keyField.fieldName();
+ int sortOrder = 0;
+
+ uassert(15974, str::stream() << sortName <<
+ " key ordering must be specified using a number",
+ keyField.isNumber());
+ sortOrder = (int)keyField.numberInt();
+
+ uassert(15975, str::stream() << sortName <<
+ " key ordering must be 1 (for ascending) or -1 (for descending",
+ ((sortOrder == 1) || (sortOrder == -1)));
+
+ pSort->addKey(pKeyFieldName, (sortOrder > 0));
+ ++sortKeys;
+ }
+
+ uassert(15976, str::stream() << sortName <<
+ " must have at least one sort key", (sortKeys > 0));
+
+ return pSort;
+ }
+
+ void DocumentSourceSort::populate() {
+ /* make sure we've got a sort key */
+ assert(vSortKey.size());
+
+ /* track and warn about how much physical memory has been used */
+ DocMemMonitor dmm(this);
+
+ /* pull everything from the underlying source */
+ for(bool hasNext = !pSource->eof(); hasNext;
+ hasNext = pSource->advance()) {
+ intrusive_ptr<Document> pDocument(pSource->getCurrent());
+ documents.push_back(Carrier(this, pDocument));
+
+ dmm.addToTotal(pDocument->getApproximateSize());
+ }
+
+ /* sort the list */
+ documents.sort(Carrier::lessThan);
+
+ /* start the sort iterator */
+ listIterator = documents.begin();
+
+ if (listIterator != documents.end())
+ pCurrent = listIterator->pDocument;
+ populated = true;
+ }
+
+ int DocumentSourceSort::compare(
+ const intrusive_ptr<Document> &pL, const intrusive_ptr<Document> &pR) {
+
+ /*
+ populate() already checked that there is a non-empty sort key,
+ so we shouldn't have to worry about that here.
+
+          However, the tricky part is what to do if none of the sort keys
+          are present. In this case, consider the document less.
+ */
+ const size_t n = vSortKey.size();
+ for(size_t i = 0; i < n; ++i) {
+ /* evaluate the sort keys */
+ ExpressionFieldPath *pE = vSortKey[i].get();
+ intrusive_ptr<const Value> pLeft(pE->evaluate(pL));
+ intrusive_ptr<const Value> pRight(pE->evaluate(pR));
+
+ /*
+ Compare the two values; if they differ, return. If they are
+ the same, move on to the next key.
+ */
+ int cmp = Value::compare(pLeft, pRight);
+ if (cmp) {
+ /* if necessary, adjust the return value by the key ordering */
+ if (!vAscending[i])
+ cmp = -cmp;
+
+ return cmp;
+ }
+ }
+
+ /*
+ If we got here, everything matched (or didn't exist), so we'll
+ consider the documents equal for purposes of this sort.
+ */
+ return 0;
+ }
+
+ bool DocumentSourceSort::Carrier::lessThan(
+ const Carrier &rL, const Carrier &rR) {
+ /* make sure these aren't from different lists */
+ assert(rL.pSort == rR.pSort);
+
+ /* compare the documents according to the sort key */
+ return (rL.pSort->compare(rL.pDocument, rR.pDocument) < 0);
+ }
+}
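
compare() above is a lexicographic comparator: walk the sort keys in
order, negate the comparison for descending keys, and let the first
differing key decide. The same structure over pairs, as a hedged sketch
(two fixed keys instead of a vSortKey vector):

    #include <algorithm>
    #include <iostream>
    #include <utility>
    #include <vector>

    typedef std::pair<int, int> Row; // two "sort key" fields per document

    // first key ascending, second key descending; first difference wins
    static bool lessThan(const Row &l, const Row &r) {
        if (l.first != r.first)
            return l.first < r.first;
        return l.second > r.second; // negated order for the descending key
    }

    int main() {
        std::vector<Row> rows;
        rows.push_back(std::make_pair(1, 2));
        rows.push_back(std::make_pair(1, 9));
        rows.push_back(std::make_pair(0, 5));
        std::sort(rows.begin(), rows.end(), lessThan);
        for (size_t i = 0; i < rows.size(); ++i)
            std::cout << rows[i].first << "," << rows[i].second << "\n";
        return 0; // prints 0,5  1,9  1,2
    }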
diff --git a/src/mongo/db/pipeline/document_source_unwind.cpp b/src/mongo/db/pipeline/document_source_unwind.cpp
new file mode 100755
index 00000000000..bb231451113
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_unwind.cpp
@@ -0,0 +1,234 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceUnwind::unwindName[] = "$unwind";
+
+ DocumentSourceUnwind::~DocumentSourceUnwind() {
+ }
+
+ DocumentSourceUnwind::DocumentSourceUnwind():
+ unwindPath(),
+ pNoUnwindDocument(),
+ pUnwindArray(),
+ pUnwinder(),
+ pUnwindValue() {
+ }
+
+ bool DocumentSourceUnwind::eof() {
+ /*
+ If we're unwinding an array, and there are more elements, then we
+ can return more documents.
+ */
+ if (pUnwinder.get() && pUnwinder->more())
+ return false;
+
+ return pSource->eof();
+ }
+
+ bool DocumentSourceUnwind::advance() {
+ if (pUnwinder.get() && pUnwinder->more()) {
+ pUnwindValue = pUnwinder->next();
+ return true;
+ }
+
+ /* release the last document and advance */
+ resetArray();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceUnwind::getCurrent() {
+ if (!pNoUnwindDocument.get()) {
+ intrusive_ptr<Document> pInDocument(pSource->getCurrent());
+
+ /* create the result document */
+ pNoUnwindDocument = pInDocument;
+ fieldIndex.clear();
+
+ /*
+ First we'll look to see if the path is there. If it isn't,
+ we'll pass this document through. If it is, we record the
+ indexes of the fields down the field path so that we can
+ quickly replace them as we clone the documents along the
+ field path.
+
+ We have to clone all the documents along the field path so
+ that we don't share the end value across documents that have
+ come out of this pipeline operator.
+ */
+ intrusive_ptr<Document> pCurrent(pInDocument);
+ const size_t pathLength = unwindPath.getPathLength();
+ for(size_t i = 0; i < pathLength; ++i) {
+ size_t idx = pCurrent->getFieldIndex(
+ unwindPath.getFieldName(i));
+ if (idx == pCurrent->getFieldCount() ) {
+ /* this document doesn't contain the target field */
+ resetArray();
+ return pInDocument;
+ break;
+ }
+
+ fieldIndex.push_back(idx);
+ Document::FieldPair fp(pCurrent->getField(idx));
+ intrusive_ptr<const Value> pPathValue(fp.second);
+ if (i < pathLength - 1) {
+ if (pPathValue->getType() != Object) {
+ /* can't walk down the field path */
+ resetArray();
+ uassert(15977, str::stream() << unwindName <<
+ ": cannot traverse field path past scalar value for \"" <<
+ fp.first << "\"", false);
+ break;
+ }
+
+ /* move down the object tree */
+ pCurrent = pPathValue->getDocument();
+ }
+ else /* (i == pathLength - 1) */ {
+ if (pPathValue->getType() != Array) {
+ /* last item on path must be an array to unwind */
+ resetArray();
+ uassert(15978, str::stream() << unwindName <<
+ ": value at end of field path must be an array",
+ false);
+ break;
+ }
+
+ /* keep track of the array we're unwinding */
+ pUnwindArray = pPathValue;
+ if (pUnwindArray->getArrayLength() == 0) {
+ /*
+ The $unwind of an empty array is a NULL value. If we
+ encounter this, use the non-unwind path, but replace
+                          the unwound array field with a null.
+
+ Make sure unwind value is clear so the array is
+ removed.
+ */
+ pUnwindValue.reset();
+ intrusive_ptr<Document> pClone(clonePath());
+ resetArray();
+ return pClone;
+ }
+
+ /* get the iterator we'll use to unwind the array */
+ pUnwinder = pUnwindArray->getArray();
+ assert(pUnwinder->more()); // we just checked above...
+ pUnwindValue = pUnwinder->next();
+ }
+ }
+ }
+
+ /*
+ If we're unwinding a field, create an alternate document. In the
+ alternate (clone), replace the unwound array field with the element
+ at the appropriate index.
+ */
+ if (pUnwindArray.get()) {
+ /* clone the document with an array we're unwinding */
+ intrusive_ptr<Document> pUnwindDocument(clonePath());
+
+ return pUnwindDocument;
+ }
+
+ return pNoUnwindDocument;
+ }
+
+ intrusive_ptr<Document> DocumentSourceUnwind::clonePath() const {
+ /*
+ For this to be valid, we must already have pNoUnwindDocument set,
+ and have set up the vector of indices for that document in fieldIndex.
+ */
+ assert(pNoUnwindDocument.get());
+ assert(pUnwinder.get());
+
+ intrusive_ptr<Document> pClone(pNoUnwindDocument->clone());
+ intrusive_ptr<Document> pCurrent(pClone);
+ const size_t n = fieldIndex.size();
+ assert(n);
+ for(size_t i = 0; i < n; ++i) {
+ const size_t fi = fieldIndex[i];
+ Document::FieldPair fp(pCurrent->getField(fi));
+ if (i + 1 < n) {
+ /*
+ For every object in the path but the last, clone it and
+ continue on down.
+ */
+ intrusive_ptr<Document> pNext(
+ fp.second->getDocument()->clone());
+ pCurrent->setField(fi, fp.first, Value::createDocument(pNext));
+ pCurrent = pNext;
+ }
+ else {
+                /* for the last, substitute the next unwound value */
+ pCurrent->setField(fi, fp.first, pUnwindValue);
+ }
+ }
+
+ return pClone;
+ }
+
+ void DocumentSourceUnwind::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append(unwindName, unwindPath.getPath(true));
+ }
+
+ intrusive_ptr<DocumentSourceUnwind> DocumentSourceUnwind::create() {
+ intrusive_ptr<DocumentSourceUnwind> pSource(
+ new DocumentSourceUnwind());
+ return pSource;
+ }
+
+ void DocumentSourceUnwind::unwindField(const FieldPath &rFieldPath) {
+ /* can't set more than one unwind field */
+ uassert(15979, str::stream() << unwindName <<
+ "can't unwind more than one path at once",
+ !unwindPath.getPathLength());
+
+ uassert(15980, "the path of the field to unwind cannot be empty",
+                rFieldPath.getPathLength());
+
+ /* record the field path */
+ unwindPath = rFieldPath;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceUnwind::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ /*
+ The value of $unwind should just be a field path.
+ */
+ uassert(15981, str::stream() << "the " << unwindName <<
+ " field path must be specified as a string",
+ pBsonElement->type() == String);
+
+ string prefixedPathString(pBsonElement->String());
+ string pathString(Expression::removeFieldPrefix(prefixedPathString));
+ intrusive_ptr<DocumentSourceUnwind> pUnwind(
+ DocumentSourceUnwind::create());
+ pUnwind->unwindPath = FieldPath(pathString);
+
+ return pUnwind;
+ }
+}
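
getCurrent() and clonePath() above fan one input document out into one
clone per array element, sharing everything except the documents along
the unwound path. The fan-out itself, in a self-contained sketch with
hypothetical types:

    #include <iostream>
    #include <string>
    #include <vector>

    struct Doc {
        std::string name;
        std::vector<int> tags; // the array field being unwound
    };

    int main() {
        Doc in;
        in.name = "widget";
        in.tags.push_back(3);
        in.tags.push_back(7);

        // one output per element: clone the document, then substitute
        // the element for the array, as clonePath() does with pUnwindValue
        for (size_t i = 0; i < in.tags.size(); ++i) {
            Doc out = in;                   // clone the whole document
            out.tags.assign(1, in.tags[i]); // substitute the unwound value
            std::cout << out.name << " tag=" << out.tags[0] << "\n";
        }
        return 0;
    }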
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
new file mode 100755
index 00000000000..b3caefcf899
--- /dev/null
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -0,0 +1,2815 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/expression.h"
+
+#include <cstdio>
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ /* --------------------------- Expression ------------------------------ */
+
+ void Expression::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ assert(false && "Expression::toMatcherBson()");
+ }
+
+ Expression::ObjectCtx::ObjectCtx(int theOptions):
+ options(theOptions),
+ unwindField() {
+ }
+
+ void Expression::ObjectCtx::unwind(string fieldName) {
+ assert(unwindOk());
+ assert(!unwindUsed());
+ assert(fieldName.size());
+ unwindField = fieldName;
+ }
+
+ bool Expression::ObjectCtx::documentOk() const {
+ return ((options & DOCUMENT_OK) != 0);
+ }
+
+ const char Expression::unwindName[] = "$unwind";
+
+ string Expression::removeFieldPrefix(const string &prefixedField) {
+ const char *pPrefixedField = prefixedField.c_str();
+ uassert(15982, str::stream() <<
+ "field path references must be prefixed with a '$' (\"" <<
+            prefixedField << "\")", pPrefixedField[0] == '$');
+
+ return string(pPrefixedField + 1);
+ }
+
+ intrusive_ptr<Expression> Expression::parseObject(
+ BSONElement *pBsonElement, ObjectCtx *pCtx) {
+ /*
+ An object expression can take any of the following forms:
+
+ f0: {f1: ..., f2: ..., f3: ...}
+ f0: {$operator:[operand1, operand2, ...]}
+ f0: {$unwind:"fieldpath"}
+
+ We handle $unwind as a special case, because this is done by the
+ projection source. For any other expression, we hand over control to
+ code that parses the expression and returns an expression.
+ */
+
+ intrusive_ptr<Expression> pExpression; // the result
+ intrusive_ptr<ExpressionObject> pExpressionObject; // alt result
+ int isOp = -1; /* -1 -> unknown, 0 -> not an operator, 1 -> operator */
+ enum { UNKNOWN, NOTOPERATOR, OPERATOR } kind = UNKNOWN;
+
+ BSONObj obj(pBsonElement->Obj());
+ BSONObjIterator iter(obj);
+ for(size_t fieldCount = 0; iter.more(); ++fieldCount) {
+ BSONElement fieldElement(iter.next());
+ const char *pFieldName = fieldElement.fieldName();
+
+ if (pFieldName[0] == '$') {
+ uassert(15983, str::stream() <<
+ "the operator must be the only field in a pipeline object (at \""
+                        << pFieldName << "\")",
+ fieldCount == 0);
+
+ /* we've determined this "object" is an operator expression */
+ isOp = 1;
+ kind = OPERATOR;
+
+ if (strcmp(pFieldName, unwindName) != 0) {
+ pExpression = parseExpression(pFieldName, &fieldElement);
+ }
+ else {
+ assert(pCtx->unwindOk());
+ // CW TODO error: it's not OK to unwind in this context
+
+ assert(!pCtx->unwindUsed());
+ // CW TODO error: this projection already has an unwind
+
+ assert(fieldElement.type() == String);
+ // CW TODO $unwind operand must be single field name
+
+ string fieldPath(removeFieldPrefix(fieldElement.String()));
+ pExpression = ExpressionFieldPath::create(fieldPath);
+ pCtx->unwind(fieldPath);
+ }
+ }
+ else {
+ uassert(15984, str::stream() << "this object is already an operator expression, and can't be used as a document expression (at \"" <<
+ pFieldName << "\")",
+ isOp != 1);
+ uassert(15990, str::stream() << "this object is already an operator expression, and can't be used as a document expression (at \"" <<
+ pFieldName << "\")",
+ kind != OPERATOR);
+
+ /* if it's our first time, create the document expression */
+ if (!pExpression.get()) {
+ assert(pCtx->documentOk());
+ // CW TODO error: document not allowed in this context
+
+ pExpressionObject = ExpressionObject::create();
+ pExpression = pExpressionObject;
+
+ /* this "object" is not an operator expression */
+ isOp = 0;
+ kind = NOTOPERATOR;
+ }
+
+ BSONType fieldType = fieldElement.type();
+ string fieldName(pFieldName);
+ if (fieldType == Object) {
+ /* it's a nested document */
+ ObjectCtx oCtx(
+ (pCtx->documentOk() ? ObjectCtx::DOCUMENT_OK : 0));
+ intrusive_ptr<Expression> pNested(
+ parseObject(&fieldElement, &oCtx));
+ pExpressionObject->addField(fieldName, pNested);
+ }
+ else if (fieldType == String) {
+ /* it's a renamed field */
+ // CW TODO could also be a constant
+ intrusive_ptr<Expression> pPath(
+ ExpressionFieldPath::create(
+ removeFieldPrefix(fieldElement.String())));
+ pExpressionObject->addField(fieldName, pPath);
+ }
+ else if (fieldType == NumberDouble) {
+ /* it's an inclusion specification */
+ int inclusion = static_cast<int>(fieldElement.Double());
+ if (inclusion == 0)
+ pExpressionObject->excludePath(fieldName);
+ else if (inclusion == 1)
+ pExpressionObject->includePath(fieldName);
+ else
+ uassert(15991, str::stream() <<
+ "\"" << fieldName <<
+ "\" numeric inclusion or exclusion must be 1 or 0 (or boolean)",
+ false);
+ }
+ else if (fieldType == Bool) {
+ bool inclusion = fieldElement.Bool();
+ if (!inclusion)
+ pExpressionObject->excludePath(fieldName);
+ else
+ pExpressionObject->includePath(fieldName);
+ }
+ else { /* nothing else is allowed */
+ uassert(15992, str::stream() <<
+ "disallowed field type " << fieldType <<
+ " in object expression (at \"" <<
+ fieldName << "\")", false);
+ }
+ }
+ }
+
+ return pExpression;
+ }
+
+
+ struct OpDesc {
+ const char *pName;
+ intrusive_ptr<ExpressionNary> (*pFactory)(void);
+ };
+
+ static int OpDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const OpDesc *)pL)->pName, ((const OpDesc *)pR)->pName);
+ }
+
+ /*
+ Keep these sorted alphabetically so we can bsearch() them using
+ OpDescCmp() above.
+ */
+ static const OpDesc OpTable[] = {
+ {"$add", ExpressionAdd::create},
+ {"$and", ExpressionAnd::create},
+ {"$cmp", ExpressionCompare::createCmp},
+ {"$cond", ExpressionCond::create},
+ {"$const", ExpressionNoOp::create},
+ {"$dayOfMonth", ExpressionDayOfMonth::create},
+ {"$dayOfWeek", ExpressionDayOfWeek::create},
+ {"$dayOfYear", ExpressionDayOfYear::create},
+ {"$divide", ExpressionDivide::create},
+ {"$eq", ExpressionCompare::createEq},
+ {"$gt", ExpressionCompare::createGt},
+ {"$gte", ExpressionCompare::createGte},
+ {"$hour", ExpressionHour::create},
+ {"$ifNull", ExpressionIfNull::create},
+ {"$lt", ExpressionCompare::createLt},
+ {"$lte", ExpressionCompare::createLte},
+ {"$minute", ExpressionMinute::create},
+ {"$mod", ExpressionMod::create},
+ {"$month", ExpressionMonth::create},
+ {"$multiply", ExpressionMultiply::create},
+ {"$ne", ExpressionCompare::createNe},
+ {"$not", ExpressionNot::create},
+ {"$or", ExpressionOr::create},
+ {"$second", ExpressionSecond::create},
+ {"$strcasecmp", ExpressionStrcasecmp::create},
+ {"$substr", ExpressionSubstr::create},
+ {"$subtract", ExpressionSubtract::create},
+ {"$toLower", ExpressionToLower::create},
+ {"$toUpper", ExpressionToUpper::create},
+ {"$week", ExpressionWeek::create},
+ {"$year", ExpressionYear::create},
+ };
+
+ static const size_t NOp = sizeof(OpTable)/sizeof(OpTable[0]);
+
+ intrusive_ptr<Expression> Expression::parseExpression(
+ const char *pOpName, BSONElement *pBsonElement) {
+ /* look for the specified operator */
+ OpDesc key;
+ key.pName = pOpName;
+ const OpDesc *pOp = (const OpDesc *)bsearch(
+ &key, OpTable, NOp, sizeof(OpDesc), OpDescCmp);
+
+ uassert(15999, str::stream() << "invalid operator \"" <<
+ pOpName << "\"", pOp);
+
+ /* make the expression node */
+ intrusive_ptr<ExpressionNary> pExpression((*pOp->pFactory)());
+
+ /* add the operands to the expression node */
+ BSONType elementType = pBsonElement->type();
+ if (elementType == Object) {
+ /* the operator must be unary and accept an object argument */
+ BSONObj objOperand(pBsonElement->Obj());
+ ObjectCtx oCtx(ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseObject(pBsonElement, &oCtx));
+ pExpression->addOperand(pOperand);
+ }
+ else if (elementType == Array) {
+ /* multiple operands - an n-ary operator */
+ vector<BSONElement> bsonArray(pBsonElement->Array());
+ const size_t n = bsonArray.size();
+ for(size_t i = 0; i < n; ++i) {
+ BSONElement *pBsonOperand = &bsonArray[i];
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseOperand(pBsonOperand));
+ pExpression->addOperand(pOperand);
+ }
+ }
+ else { /* assume it's an atomic operand */
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseOperand(pBsonElement));
+ pExpression->addOperand(pOperand);
+ }
+
+ return pExpression;
+ }
+
+ intrusive_ptr<Expression> Expression::parseOperand(BSONElement *pBsonElement) {
+ BSONType type = pBsonElement->type();
+
+ switch(type) {
+ case String: {
+ /*
+ This could be a field path, or it could be a constant
+ string.
+
+ We make a copy of the BSONElement reader so we can read its
+ value without advancing its state, in case we need to read it
+ again in the constant code path.
+ */
+ BSONElement opCopy(*pBsonElement);
+ string value(opCopy.String());
+
+ /* check for a field path */
+ if (value[0] != '$')
+ goto ExpectConstant; // assume plain string constant
+
+ /* if we got here, this is a field path expression */
+ string fieldPath(removeFieldPrefix(value));
+ intrusive_ptr<Expression> pFieldExpr(
+ ExpressionFieldPath::create(fieldPath));
+ return pFieldExpr;
+ }
+
+ case Object: {
+ ObjectCtx oCtx(ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pSubExpression(
+ Expression::parseObject(pBsonElement, &oCtx));
+ return pSubExpression;
+ }
+
+ default:
+ ExpectConstant: {
+ intrusive_ptr<Expression> pOperand(
+ ExpressionConstant::createFromBsonElement(pBsonElement));
+ return pOperand;
+ }
+
+ } // switch(type)
+
+ /* NOTREACHED */
+ assert(false);
+ return intrusive_ptr<Expression>();
+ }
+
+ /* ------------------------- ExpressionAdd ----------------------------- */
+
+ ExpressionAdd::~ExpressionAdd() {
+ }
+
+ intrusive_ptr<Expression> ExpressionAdd::optimize() {
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+ ExpressionAdd *pA = dynamic_cast<ExpressionAdd *>(pE.get());
+ if (pA) {
+ /* don't create a circular reference */
+ if (pA != this)
+ pA->pAdd = this;
+ }
+
+ return pE;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionAdd::create() {
+ intrusive_ptr<ExpressionAdd> pExpression(new ExpressionAdd());
+ return pExpression;
+ }
+
+ ExpressionAdd::ExpressionAdd():
+ ExpressionNary(),
+ useOriginal(false) {
+ }
+
+ intrusive_ptr<const Value> ExpressionAdd::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ unsigned stringCount = 0;
+ unsigned nonConstStringCount = 0;
+ unsigned dateCount = 0;
+ const size_t n = vpOperand.size();
+ vector<intrusive_ptr<const Value> > vpValue; /* evaluated operands */
+
+ /* use the original, if we've been told to do so */
+ if (useOriginal) {
+ return pAdd->evaluate(pDocument);
+ }
+
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(
+ vpOperand[i]->evaluate(pDocument));
+ vpValue.push_back(pValue);
+
+ BSONType valueType = pValue->getType();
+ if (valueType == String) {
+ ++stringCount;
+ if (!dynamic_cast<ExpressionConstant *>(vpOperand[i].get()))
+ ++nonConstStringCount;
+ }
+ else if (valueType == Date)
+ ++dateCount;
+ }
+
+ /*
+          We don't allow adding two dates because it doesn't make sense,
+          especially since they are in epoch time. However, if there is a
+          string present, then we would be appending the dates to a string,
+          so having many is not a problem.
+ */
+ if ((dateCount > 1) && !stringCount) {
+ uassert(16000, "can't add two dates together", false);
+ return Value::getNull();
+ }
+
+ /*
+ If there are non-constant strings, and we've got a copy of the
+ original, then use that from this point forward. This is necessary
+ to keep the order of strings the same for string concatenation;
+ constant-folding would violate the order preservation.
+
+ This is a one-way conversion we do if we see one of these. It is
+ possible that these could vary from document to document, but any
+ sane schema probably isn't going to do that, so once we see a string,
+ we can probably assume they're going to be strings all the way down.
+ */
+ if (nonConstStringCount && pAdd.get()) {
+ useOriginal = true;
+ return pAdd->evaluate(pDocument);
+ }
+
+ if (stringCount) {
+ stringstream stringTotal;
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+ stringTotal << pValue->coerceToString();
+ }
+
+ return Value::createString(stringTotal.str());
+ }
+
+ if (dateCount) {
+ long long dateTotal = 0;
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+ if (pValue->getType() == Date)
+ dateTotal += pValue->coerceToDate();
+ else
+ dateTotal += static_cast<long long>(pValue->coerceToDouble()*24*60*60*1000);
+ }
+
+ return Value::createDate(Date_t(dateTotal));
+ }
+
+ /*
+ We'll try to return the narrowest possible result value. To do that
+ without creating intermediate Values, do the arithmetic for double
+ and integral types in parallel, tracking the current narrowest
+ type.
+ */
+ double doubleTotal = 0;
+ long long longTotal = 0;
+ BSONType totalType = NumberInt;
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+
+ totalType = Value::getWidestNumeric(totalType, pValue->getType());
+ doubleTotal += pValue->coerceToDouble();
+ longTotal += pValue->coerceToLong();
+ }
+
+ if (totalType == NumberDouble)
+ return Value::createDouble(doubleTotal);
+ if (totalType == NumberLong)
+ return Value::createLong(longTotal);
+ return Value::createInt((int)longTotal);
+ }
+
+ const char *ExpressionAdd::getOpName() const {
+ return "$add";
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionAdd::getFactory() const)() {
+ return ExpressionAdd::create;
+ }
+
+ void ExpressionAdd::toBson(
+ BSONObjBuilder *pBuilder, const char *pOpName, unsigned depth) const {
+
+ if (pAdd)
+ pAdd->toBson(pBuilder, pOpName, depth);
+ else
+ ExpressionNary::toBson(pBuilder, pOpName, depth);
+ }
+
+
+ /* ------------------------- ExpressionAnd ----------------------------- */
+
+ ExpressionAnd::~ExpressionAnd() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionAnd::create() {
+ intrusive_ptr<ExpressionNary> pExpression(new ExpressionAnd());
+ return pExpression;
+ }
+
+ ExpressionAnd::ExpressionAnd():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<Expression> ExpressionAnd::optimize() {
+ /* optimize the conjunction as much as possible */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /* if the result isn't a conjunction, we can't do anything */
+ ExpressionAnd *pAnd = dynamic_cast<ExpressionAnd *>(pE.get());
+ if (!pAnd)
+ return pE;
+
+ /*
+ Check the last argument on the result; if it's not constant (as
+          promised by ExpressionNary::optimize()), then there's nothing
+ we can do.
+ */
+ const size_t n = pAnd->vpOperand.size();
+ intrusive_ptr<Expression> pLast(pAnd->vpOperand[n - 1]);
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pLast.get());
+ if (!pConst)
+ return pE;
+
+ /*
+ Evaluate and coerce the last argument to a boolean. If it's false,
+ then we can replace this entire expression.
+ */
+ bool last = pLast->evaluate(intrusive_ptr<Document>())->coerceToBool();
+ if (!last) {
+ intrusive_ptr<ExpressionConstant> pFinal(
+ ExpressionConstant::create(Value::getFalse()));
+ return pFinal;
+ }
+
+ /*
+ If we got here, the final operand was true, so we don't need it
+ anymore. If there was only one other operand, we don't need the
+ conjunction either. Note we still need to keep the promise that
+ the result will be a boolean.
+ */
+ if (n == 2) {
+ intrusive_ptr<Expression> pFinal(
+ ExpressionCoerceToBool::create(pAnd->vpOperand[0]));
+ return pFinal;
+ }
+
+ /*
+ Remove the final "true" value, and return the new expression.
+
+ CW TODO:
+ Note that because of any implicit conversions, we may need to
+ apply an implicit boolean conversion.
+ */
+ pAnd->vpOperand.resize(n - 1);
+ return pE;
+ }
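+
+ /*
+ A sketch of the rewrites above (hypothetical operands):
+
+ { $and : [ "$a", "$b", false ] } --> constant false
+ { $and : [ "$a", true ] } --> coerce-to-bool of "$a"
+ { $and : [ "$a", "$b", true ] } --> { $and : [ "$a", "$b" ] }
+
+ These three cases are exhaustive because ExpressionNary::optimize()
+ has already folded all constant operands into a single trailing
+ constant.
+ */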
+
+ intrusive_ptr<const Value> ExpressionAnd::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+ if (!pValue->coerceToBool())
+ return Value::getFalse();
+ }
+
+ return Value::getTrue();
+ }
+
+ const char *ExpressionAnd::getOpName() const {
+ return "$and";
+ }
+
+ void ExpressionAnd::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ /*
+ There are two patterns we can handle:
+ (1) one or two comparisons on the same field: { a:{$gte:3, $lt:7} }
+ (2) multiple field comparisons: {a:7, b:{$lte:6}, c:2}
+ This can be recognized as a conjunction of a set of range
+ expressions. Direct equality is a degenerate range expression;
+ range expressions can be open-ended.
+ */
+ assert(false && "unimplemented");
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionAnd::getFactory() const)() {
+ return ExpressionAnd::create;
+ }
+
+ /* -------------------- ExpressionCoerceToBool ------------------------- */
+
+ ExpressionCoerceToBool::~ExpressionCoerceToBool() {
+ }
+
+ intrusive_ptr<ExpressionCoerceToBool> ExpressionCoerceToBool::create(
+ const intrusive_ptr<Expression> &pExpression) {
+ intrusive_ptr<ExpressionCoerceToBool> pNew(
+ new ExpressionCoerceToBool(pExpression));
+ return pNew;
+ }
+
+ ExpressionCoerceToBool::ExpressionCoerceToBool(
+ const intrusive_ptr<Expression> &pTheExpression):
+ Expression(),
+ pExpression(pTheExpression) {
+ }
+
+ intrusive_ptr<Expression> ExpressionCoerceToBool::optimize() {
+ /* optimize the operand */
+ pExpression = pExpression->optimize();
+
+ /* if the operand already produces a boolean, then we don't need this */
+ /* LATER - Expression to support a "typeof" query? */
+ Expression *pE = pExpression.get();
+ if (dynamic_cast<ExpressionAnd *>(pE) ||
+ dynamic_cast<ExpressionOr *>(pE) ||
+ dynamic_cast<ExpressionNot *>(pE) ||
+ dynamic_cast<ExpressionCoerceToBool *>(pE))
+ return pExpression;
+
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionCoerceToBool::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+
+ intrusive_ptr<const Value> pResult(pExpression->evaluate(pDocument));
+ bool b = pResult->coerceToBool();
+ if (b)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ void ExpressionCoerceToBool::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ assert(false && "not possible"); // no equivalent of this
+ }
+
+ void ExpressionCoerceToBool::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ assert(false && "not possible"); // no equivalent of this
+ }
+
+ /* ----------------------- ExpressionCompare --------------------------- */
+
+ ExpressionCompare::~ExpressionCompare() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createEq() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(EQ));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createNe() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(NE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createGt() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(GT));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createGte() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(GTE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createLt() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(LT));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createLte() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(LTE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createCmp() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(CMP));
+ return pExpression;
+ }
+
+ ExpressionCompare::ExpressionCompare(CmpOp theCmpOp):
+ ExpressionNary(),
+ cmpOp(theCmpOp) {
+ }
+
+ void ExpressionCompare::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ /*
+ Lookup table for truth value returns
+ */
+ struct CmpLookup {
+ bool truthValue[3]; /* truth value for -1, 0, 1 */
+ Expression::CmpOp reverse; /* reverse comparison operator */
+ char name[5]; /* string name (w/trailing '\0') */
+ };
+ static const CmpLookup cmpLookup[7] = {
+ /* -1 0 1 reverse name */
+ /* EQ */ { { false, true, false }, Expression::EQ, "$eq" },
+ /* NE */ { { true, false, true }, Expression::NE, "$ne" },
+ /* GT */ { { false, false, true }, Expression::LT, "$gt" },
+ /* GTE */ { { false, true, true }, Expression::LTE, "$gte" },
+ /* LT */ { { true, false, false }, Expression::GT, "$lt" },
+ /* LTE */ { { true, true, false }, Expression::GTE, "$lte" },
+ /* CMP */ { { false, false, false }, Expression::CMP, "$cmp" },
+ };
+
+ intrusive_ptr<Expression> ExpressionCompare::optimize() {
+ /* first optimize the comparison operands */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /*
+ If the result of optimization is no longer a comparison, there's
+ nothing more we can do.
+ */
+ ExpressionCompare *pCmp = dynamic_cast<ExpressionCompare *>(pE.get());
+ if (!pCmp)
+ return pE;
+
+ /* check to see if optimizing comparison operator is supported */
+ CmpOp newOp = pCmp->cmpOp;
+ if (newOp == CMP)
+ return pE; // not reversible: there's nothing more we can do
+
+ /*
+ There's one localized optimization we recognize: a comparison
+ between a field and a constant. If we recognize that pattern,
+ replace it with an ExpressionFieldRange.
+
+ When looking for this pattern, note that the operands could appear
+ in any order. If we need to reverse the sense of the comparison to
+ put it into the required canonical form, do so.
+ */
+ intrusive_ptr<Expression> pLeft(pCmp->vpOperand[0]);
+ intrusive_ptr<Expression> pRight(pCmp->vpOperand[1]);
+ intrusive_ptr<ExpressionFieldPath> pFieldPath(
+ dynamic_pointer_cast<ExpressionFieldPath>(pLeft));
+ intrusive_ptr<ExpressionConstant> pConstant;
+ if (pFieldPath.get()) {
+ pConstant = dynamic_pointer_cast<ExpressionConstant>(pRight);
+ if (!pConstant.get())
+ return pE; // there's nothing more we can do
+ }
+ else {
+ /* if the first operand wasn't a path, see if it's a constant */
+ pConstant = dynamic_pointer_cast<ExpressionConstant>(pLeft);
+ if (!pConstant.get())
+ return pE; // there's nothing more we can do
+
+ /* the left operand was a constant; see if the right is a path */
+ pFieldPath = dynamic_pointer_cast<ExpressionFieldPath>(pRight);
+ if (!pFieldPath.get())
+ return pE; // there's nothing more we can do
+
+ /* these were not in canonical order, so reverse the sense */
+ newOp = cmpLookup[newOp].reverse;
+ }
+
+ return ExpressionFieldRange::create(
+ pFieldPath, newOp, pConstant->getValue());
+ }
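+
+ /*
+ A sketch of the rewrite above (hypothetical operands):
+
+ { $gt : [ "$a", 5 ] } --> field range a > 5
+ { $gt : [ 5, "$a" ] } --> field range a < 5 (sense reversed
+ via cmpLookup)
+
+ Anything other than exactly one field path compared against one
+ constant is left as an ordinary ExpressionCompare.
+ */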
+
+ intrusive_ptr<const Value> ExpressionCompare::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ BSONType leftType = pLeft->getType();
+ BSONType rightType = pRight->getType();
+ uassert(15994, str::stream() << getOpName() <<
+ ": no automatic conversion for types " <<
+ leftType << " and " << rightType,
+ leftType == rightType);
+ // CW TODO at least for now. later, handle automatic conversions
+
+ int cmp = 0;
+ switch(leftType) {
+ case NumberDouble: {
+ double left = pLeft->getDouble();
+ double right = pRight->getDouble();
+
+ if (left < right)
+ cmp = -1;
+ else if (left > right)
+ cmp = 1;
+ break;
+ }
+
+ case NumberInt: {
+ int left = pLeft->getInt();
+ int right = pRight->getInt();
+
+ if (left < right)
+ cmp = -1;
+ else if (left > right)
+ cmp = 1;
+ break;
+ }
+
+ case String: {
+ string left(pLeft->getString());
+ string right(pRight->getString());
+ cmp = signum(left.compare(right));
+ break;
+ }
+
+ default:
+ uassert(15995, str::stream() <<
+ "can't compare values of type " << leftType, false);
+ break;
+ }
+
+ if (cmpOp == CMP) {
+ switch(cmp) {
+ case -1:
+ return Value::getMinusOne();
+ case 0:
+ return Value::getZero();
+ case 1:
+ return Value::getOne();
+
+ default:
+ assert(false); // CW TODO internal error
+ return Value::getNull();
+ }
+ }
+
+ bool returnValue = cmpLookup[cmpOp].truthValue[cmp + 1];
+ if (returnValue)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ const char *ExpressionCompare::getOpName() const {
+ return cmpLookup[cmpOp].name;
+ }
+
+ /* ----------------------- ExpressionCond ------------------------------ */
+
+ ExpressionCond::~ExpressionCond() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCond::create() {
+ intrusive_ptr<ExpressionCond> pExpression(new ExpressionCond());
+ return pExpression;
+ }
+
+ ExpressionCond::ExpressionCond():
+ ExpressionNary() {
+ }
+
+ void ExpressionCond::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(3);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionCond::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(3);
+ intrusive_ptr<const Value> pCond(vpOperand[0]->evaluate(pDocument));
+ int idx = pCond->coerceToBool() ? 1 : 2;
+ return vpOperand[idx]->evaluate(pDocument);
+ }
+
+ const char *ExpressionCond::getOpName() const {
+ return "$cond";
+ }
+
+ /* ---------------------- ExpressionConstant --------------------------- */
+
+ ExpressionConstant::~ExpressionConstant() {
+ }
+
+ intrusive_ptr<ExpressionConstant> ExpressionConstant::createFromBsonElement(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<ExpressionConstant> pEC(
+ new ExpressionConstant(pBsonElement));
+ return pEC;
+ }
+
+ ExpressionConstant::ExpressionConstant(BSONElement *pBsonElement):
+ pValue(Value::createFromBsonElement(pBsonElement)) {
+ }
+
+ intrusive_ptr<ExpressionConstant> ExpressionConstant::create(
+ const intrusive_ptr<const Value> &pValue) {
+ intrusive_ptr<ExpressionConstant> pEC(new ExpressionConstant(pValue));
+ return pEC;
+ }
+
+ ExpressionConstant::ExpressionConstant(
+ const intrusive_ptr<const Value> &pTheValue):
+ pValue(pTheValue) {
+ }
+
+
+ intrusive_ptr<Expression> ExpressionConstant::optimize() {
+ /* nothing to do */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionConstant::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return pValue;
+ }
+
+ void ExpressionConstant::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+
+ /*
+ For depth greater than one, do the regular thing.
+
+ Depth is greater than one whenever this constant sits inside an
+ operator node; a constant is only at depth 1 (counting up as we
+ go down the expression tree) when it is itself the entire
+ top-level expression for a field.
+
+ See the comment below for how that case is handled.
+ */
+ if (depth > 1) {
+ pValue->addToBsonObj(pBuilder, fieldName);
+ return;
+ }
+
+ /*
+ If this happens at the top level, we don't have any direct way
+ to express it. However, we may need to if constant folding
+ reduced expressions to constants, and we need to re-materialize
+ the pipeline in order to ship it to a shard server. This has
+ forced the introduction of {$const: ...}.
+ */
+ BSONObjBuilder constBuilder;
+ pValue->addToBsonObj(&constBuilder, "$const");
+ pBuilder->append(fieldName, constBuilder.done());
+ }
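+
+ /*
+ A sketch of the case above: if constant folding reduces
+ { $project : { x : { $add : [ 1, 2 ] } } } to the constant 3,
+ re-materializing the pipeline for a shard produces
+ { $project : { x : { $const : 3 } } }; a bare { x : 3 } at the
+ top level would be read as a field-inclusion flag, not a literal.
+ */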
+
+ void ExpressionConstant::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ pValue->addToBsonArray(pBuilder);
+ }
+
+ const char *ExpressionConstant::getOpName() const {
+ assert(false); // this has no name
+ return NULL;
+ }
+
+ /* ---------------------- ExpressionDayOfMonth ------------------------- */
+
+ ExpressionDayOfMonth::~ExpressionDayOfMonth() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfMonth::create() {
+ intrusive_ptr<ExpressionDayOfMonth> pExpression(new ExpressionDayOfMonth());
+ return pExpression;
+ }
+
+ ExpressionDayOfMonth::ExpressionDayOfMonth():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfMonth::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfMonth::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_mday);
+ }
+
+ const char *ExpressionDayOfMonth::getOpName() const {
+ return "$dayOfMonth";
+ }
+
+ /* ------------------------- ExpressionDayOfWeek ----------------------------- */
+
+ ExpressionDayOfWeek::~ExpressionDayOfWeek() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfWeek::create() {
+ intrusive_ptr<ExpressionDayOfWeek> pExpression(new ExpressionDayOfWeek());
+ return pExpression;
+ }
+
+ ExpressionDayOfWeek::ExpressionDayOfWeek():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfWeek::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfWeek::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_wday+1); // MySQL uses 1-7 tm uses 0-6
+ }
+
+ const char *ExpressionDayOfWeek::getOpName() const {
+ return "$dayOfWeek";
+ }
+
+ /* ------------------------- ExpressionDayOfYear ----------------------------- */
+
+ ExpressionDayOfYear::~ExpressionDayOfYear() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfYear::create() {
+ intrusive_ptr<ExpressionDayOfYear> pExpression(new ExpressionDayOfYear());
+ return pExpression;
+ }
+
+ ExpressionDayOfYear::ExpressionDayOfYear():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfYear::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfYear::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_yday+1); // MySQL uses 1-366 tm uses 0-365
+ }
+
+ const char *ExpressionDayOfYear::getOpName() const {
+ return "$dayOfYear";
+ }
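+
+ /*
+ The three day operators above differ only in the tm field read
+ and the offset applied. For a hypothetical date of 2011-12-24
+ (a Saturday):
+
+ $dayOfMonth -> 24 (tm_mday, already 1-based)
+ $dayOfWeek -> 7 (tm_wday 6, plus 1)
+ $dayOfYear -> 358 (tm_yday 357, plus 1)
+ */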
+
+ /* ----------------------- ExpressionDivide ---------------------------- */
+
+ ExpressionDivide::~ExpressionDivide() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDivide::create() {
+ intrusive_ptr<ExpressionDivide> pExpression(new ExpressionDivide());
+ return pExpression;
+ }
+
+ ExpressionDivide::ExpressionDivide():
+ ExpressionNary() {
+ }
+
+ void ExpressionDivide::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDivide::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ double right = pRight->coerceToDouble();
+ if (right == 0)
+ return Value::getUndefined();
+
+ double left = pLeft->coerceToDouble();
+
+ return Value::createDouble(left / right);
+ }
+
+ const char *ExpressionDivide::getOpName() const {
+ return "$divide";
+ }
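+
+ /*
+ Note the contract above: $divide always produces a double, and a
+ zero divisor yields the undefined value rather than an error,
+ e.g. (hypothetical operands)
+
+ { $divide : [ 7, 2 ] } -> 3.5
+ { $divide : [ 7, 0 ] } -> undefined
+ */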
+
+ /* ---------------------- ExpressionObject --------------------------- */
+
+ ExpressionObject::~ExpressionObject() {
+ }
+
+ intrusive_ptr<ExpressionObject> ExpressionObject::create() {
+ intrusive_ptr<ExpressionObject> pExpression(new ExpressionObject());
+ return pExpression;
+ }
+
+ ExpressionObject::ExpressionObject():
+ excludePaths(false),
+ path(),
+ vFieldName(),
+ vpExpression() {
+ }
+
+ intrusive_ptr<Expression> ExpressionObject::optimize() {
+ const size_t n = vpExpression.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pE(vpExpression[i]->optimize());
+ vpExpression[i] = pE;
+ }
+
+ return intrusive_ptr<Expression>(this);
+ }
+
+ void ExpressionObject::addToDocument(
+ const intrusive_ptr<Document> &pResult,
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t pathSize = path.size();
+ set<string>::const_iterator end(path.end());
+
+ /*
+ Take care of inclusions or exclusions. Note that _id is special
+ in that it is always included unless it is specifically excluded;
+ we use excludeId for that in case excludePaths is false, which
+ means to include paths.
+ */
+ if (pathSize) {
+ auto_ptr<FieldIterator> pIter(pDocument->createFieldIterator());
+ if (excludePaths) {
+ while(pIter->more()) {
+ pair<string, intrusive_ptr<const Value> > field(pIter->next());
+
+ /*
+ If the field in the document is not in the exclusion set,
+ add it to the result document.
+
+ Note that exclusions are only allowed on leaves, so we
+ can assume we don't have to descend recursively here.
+ */
+ if (path.find(field.first) != end)
+ continue; // we found it, so don't add it
+
+ pResult->addField(field.first, field.second);
+ }
+ }
+ else { /* !excludePaths */
+ while(pIter->more()) {
+ pair<string, intrusive_ptr<const Value> > field(
+ pIter->next());
+ /*
+ If the field in the document is in the inclusion set,
+ add it to the result document. Or, if we're not
+ excluding _id, and it is _id, include it.
+
+ Note that this could be an inclusion along a pathway,
+ so we look for an ExpressionObject in vpExpression; when
+ we find one, we populate the result with the evaluation
+ of that on the nested object, yielding relative paths.
+ This also allows us to handle intermediate arrays; if we
+ encounter one, we repeat this for each array element.
+ */
+ if (path.find(field.first) != end) {
+ /* find the Expression */
+ const size_t n = vFieldName.size();
+ size_t i;
+ Expression *pE = NULL;
+ for(i = 0; i < n; ++i) {
+ if (field.first.compare(vFieldName[i]) == 0) {
+ pE = vpExpression[i].get();
+ break;
+ }
+ }
+
+ /*
+ If we didn't find an expression, it's the last path
+ element to include.
+ */
+ if (!pE) {
+ pResult->addField(field.first, field.second);
+ continue;
+ }
+
+ ExpressionObject *pChild =
+ dynamic_cast<ExpressionObject *>(pE);
+ assert(pChild);
+
+ /*
+ Check on the type of the result object. If it's an
+ object, just walk down into that recursively, and
+ add it to the result.
+ */
+ BSONType valueType = field.second->getType();
+ if (valueType == Object) {
+ intrusive_ptr<Document> pD(
+ pChild->evaluateDocument(
+ field.second->getDocument()));
+ pResult->addField(vFieldName[i],
+ Value::createDocument(pD));
+ }
+ else if (valueType == Array) {
+ /*
+ If it's an array, we have to do the same thing,
+ but to each array element. Then, add the array
+ of results to the current document.
+ */
+ vector<intrusive_ptr<const Value> > result;
+ intrusive_ptr<ValueIterator> pVI(
+ field.second->getArray());
+ while(pVI->more()) {
+ intrusive_ptr<Document> pD(
+ pChild->evaluateDocument(
+ pVI->next()->getDocument()));
+ result.push_back(Value::createDocument(pD));
+ }
+
+ pResult->addField(vFieldName[i],
+ Value::createArray(result));
+ }
+ }
+ }
+ }
+ }
+
+ /* add any remaining fields we haven't already taken care of */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ string fieldName(vFieldName[i]);
+
+ /* if we've already dealt with this field, above, do nothing */
+ if (path.find(fieldName) != end)
+ continue;
+
+ intrusive_ptr<const Value> pValue(
+ vpExpression[i]->evaluate(pDocument));
+
+ /*
+ Don't add non-existent values (note: different from NULL);
+ this is consistent with existing selection syntax which doesn't
+ force the appearance of non-existent fields.
+ */
+ if (pValue->getType() == Undefined)
+ continue;
+
+ pResult->addField(fieldName, pValue);
+ }
+ }
+
+ size_t ExpressionObject::getSizeHint(
+ const intrusive_ptr<Document> &pDocument) const {
+ size_t sizeHint = pDocument->getFieldCount();
+ const size_t pathSize = path.size();
+ if (!excludePaths)
+ sizeHint += pathSize;
+ else {
+ size_t excludeCount = pathSize;
+ if (sizeHint > excludeCount)
+ sizeHint -= excludeCount;
+ else
+ sizeHint = 0;
+ }
+
+ /* account for the additional computed fields */
+ sizeHint += vFieldName.size();
+
+ return sizeHint;
+ }
+
+ intrusive_ptr<Document> ExpressionObject::evaluateDocument(
+ const intrusive_ptr<Document> &pDocument) const {
+ /* create and populate the result */
+ intrusive_ptr<Document> pResult(
+ Document::create(getSizeHint(pDocument)));
+ addToDocument(pResult, pDocument);
+ return pResult;
+ }
+
+ intrusive_ptr<const Value> ExpressionObject::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return Value::createDocument(evaluateDocument(pDocument));
+ }
+
+ void ExpressionObject::addField(const string &fieldName,
+ const intrusive_ptr<Expression> &pExpression) {
+ /* must have an expression */
+ assert(pExpression.get());
+
+ /* parse the field path */
+ FieldPath fieldPath(fieldName);
+ uassert(16008, str::stream() <<
+ "an expression object's field names cannot be field paths (at \"" <<
+ fieldName << "\")", fieldPath.getPathLength() == 1);
+
+ /* make sure it isn't a name we've included or excluded */
+ set<string>::iterator ex(path.find(fieldName));
+ uassert(16009, str::stream() <<
+ "can't add a field to an object expression that has already been excluded (at \"" <<
+ fieldName << "\")", ex == path.end());
+
+ /* make sure it isn't a name we've already got */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ uassert(16010, str::stream() <<
+ "can't add the same field to an object expression more than once (at \"" <<
+ fieldName << "\")",
+ fieldName.compare(vFieldName[i]) != 0);
+ }
+
+ vFieldName.push_back(fieldName);
+ vpExpression.push_back(pExpression);
+ }
+
+ void ExpressionObject::includePath(
+ const FieldPath *pPath, size_t pathi, size_t pathn, bool excludeLast) {
+
+ /* get the current path field name */
+ string fieldName(pPath->getFieldName(pathi));
+ uassert(16011,
+ "an object expression can't include an empty field-name",
+ fieldName.length());
+
+ const size_t pathCount = path.size();
+
+ /* if this is the leaf-most object, stop */
+ if (pathi == pathn - 1) {
+ /*
+ Make sure the exclusion configuration of this node matches
+ the requested result. Or, that this is the first (determining)
+ specification.
+ */
+ uassert(16012, str::stream() <<
+ "incompatible exclusion for \"" <<
+ pPath->getPath(false) <<
+ "\" because of a prior inclusion that includes a common sub-path",
+ ((excludePaths == excludeLast) || !pathCount));
+
+ excludePaths = excludeLast; // if (!pathCount), set this
+ path.insert(fieldName);
+ return;
+ }
+
+ /* this level had better be about inclusions */
+ uassert(16013, str::stream() <<
+ "incompatible inclusion for \"" << pPath->getPath(false) <<
+ "\" because of a prior exclusion that includes a common sub-path",
+ !excludePaths);
+
+ /* see if we already know about this field */
+ const size_t n = vFieldName.size();
+ size_t i;
+ for(i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ break;
+ }
+
+ /* find the right object, and continue */
+ ExpressionObject *pChild;
+ if (i < n) {
+ /* the intermediate child already exists */
+ pChild = dynamic_cast<ExpressionObject *>(vpExpression[i].get());
+ assert(pChild);
+ }
+ else {
+ /*
+ If we get here, the intervening child isn't already there,
+ so create it.
+ */
+ intrusive_ptr<ExpressionObject> pSharedChild(
+ ExpressionObject::create());
+ path.insert(fieldName);
+ vFieldName.push_back(fieldName);
+ vpExpression.push_back(pSharedChild);
+ pChild = pSharedChild.get();
+ }
+
+ // LATER CW TODO turn this into a loop
+ pChild->includePath(pPath, pathi + 1, pathn, excludeLast);
+ }
+
+ void ExpressionObject::includePath(const string &theFieldPath) {
+ /* parse the field path */
+ FieldPath fieldPath(theFieldPath);
+ includePath(&fieldPath, 0, fieldPath.getPathLength(), false);
+ }
+
+ void ExpressionObject::excludePath(const string &theFieldPath) {
+ /* parse the field path */
+ FieldPath fieldPath(theFieldPath);
+ includePath(&fieldPath, 0, fieldPath.getPathLength(), true);
+ }
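+
+ /*
+ A sketch of how a projection builds this tree (field names are
+ hypothetical): including "a.b" and then "a.c" calls includePath()
+ twice; the first call creates an intermediate ExpressionObject
+ child for "a" and records the leaf "b", and the second reuses
+ that child and adds the leaf "c". Subsequently excluding "a.d"
+ would trip the "incompatible exclusion" uassert (16012), because
+ a subtree that already includes paths can't also exclude them.
+ */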
+
+ intrusive_ptr<Expression> ExpressionObject::getField(
+ const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpExpression[i];
+ }
+
+ /* if we got here, we didn't find it */
+ return intrusive_ptr<Expression>();
+ }
+
+ void ExpressionObject::emitPaths(
+ BSONObjBuilder *pBuilder, vector<string> *pvPath) const {
+ if (!path.size())
+ return;
+
+ /* we use these for loops */
+ const size_t nField = vFieldName.size();
+ const size_t nPath = pvPath->size();
+
+ /*
+ We can iterate over the inclusion/exclusion paths in their
+ (random) set order because they don't affect the order that
+ fields are listed in the result. That comes from the underlying
+ Document they are fetched from.
+ */
+ for(set<string>::const_iterator end(path.end()),
+ iter(path.begin()); iter != end; ++iter) {
+
+ /* find the matching field description */
+ size_t iField = 0;
+ for(; iField < nField; ++iField) {
+ if (iter->compare(vFieldName[iField]) == 0)
+ break;
+ }
+
+ if (iField == nField) {
+ /*
+ If we didn't find a matching field description, this is the
+ leaf, so add the path.
+ */
+ stringstream ss;
+
+ for(size_t iPath = 0; iPath < nPath; ++iPath)
+ ss << (*pvPath)[iPath] << ".";
+ ss << *iter;
+
+ pBuilder->append(ss.str(), !excludePaths);
+ }
+ else {
+ /*
+ If we found a matching field description, then we need to
+ descend into the next level.
+ */
+ Expression *pE = vpExpression[iField].get();
+ ExpressionObject *pEO = dynamic_cast<ExpressionObject *>(pE);
+ assert(pEO);
+
+ /*
+ Add the current field name to the path being built up,
+ then go down into the next level.
+ */
+ PathPusher pathPusher(pvPath, vFieldName[iField]);
+ pEO->emitPaths(pBuilder, pvPath);
+ }
+ }
+ }
+
+ void ExpressionObject::documentToBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+
+ /* emit any inclusion/exclusion paths */
+ vector<string> vPath;
+ emitPaths(pBuilder, &vPath);
+
+ /* then add any expressions */
+ const size_t nField = vFieldName.size();
+ const set<string>::const_iterator pathEnd(path.end());
+ for(size_t iField = 0; iField < nField; ++iField) {
+ string fieldName(vFieldName[iField]);
+
+ /* if we already took care of this, don't repeat it */
+ if (path.find(fieldName) != pathEnd)
+ continue;
+
+ vpExpression[iField]->addToBsonObj(pBuilder, fieldName, depth + 1);
+ }
+ }
+
+ void ExpressionObject::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+
+ BSONObjBuilder objBuilder;
+ documentToBson(&objBuilder, depth);
+ pBuilder->append(fieldName, objBuilder.done());
+ }
+
+ void ExpressionObject::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+
+ BSONObjBuilder objBuilder;
+ documentToBson(&objBuilder, depth);
+ pBuilder->append(objBuilder.done());
+ }
+
+ /* --------------------- ExpressionFieldPath --------------------------- */
+
+ ExpressionFieldPath::~ExpressionFieldPath() {
+ }
+
+ intrusive_ptr<ExpressionFieldPath> ExpressionFieldPath::create(
+ const string &fieldPath) {
+ intrusive_ptr<ExpressionFieldPath> pExpression(
+ new ExpressionFieldPath(fieldPath));
+ return pExpression;
+ }
+
+ ExpressionFieldPath::ExpressionFieldPath(
+ const string &theFieldPath):
+ fieldPath(theFieldPath) {
+ }
+
+ intrusive_ptr<Expression> ExpressionFieldPath::optimize() {
+ /* nothing can be done for these */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldPath::evaluatePath(
+ size_t index, const size_t pathLength,
+ intrusive_ptr<Document> pDocument) const {
+ intrusive_ptr<const Value> pValue; /* the return value */
+
+ pValue = pDocument->getValue(fieldPath.getFieldName(index));
+
+ /* if the field doesn't exist, quit with an undefined value */
+ if (!pValue.get())
+ return Value::getUndefined();
+
+ /* if we've hit the end of the path, stop */
+ ++index;
+ if (index >= pathLength)
+ return pValue;
+
+ /*
+ We're diving deeper. If the value was null, return null.
+ */
+ BSONType type = pValue->getType();
+ if ((type == Undefined) || (type == jstNULL))
+ return Value::getUndefined();
+
+ if (type == Object) {
+ /* extract from the next level down */
+ return evaluatePath(index, pathLength, pValue->getDocument());
+ }
+
+ if (type == Array) {
+ /*
+ We're going to repeat this for each member of the array,
+ building up a new array as we go.
+ */
+ vector<intrusive_ptr<const Value> > result;
+ intrusive_ptr<ValueIterator> pIter(pValue->getArray());
+ while(pIter->more()) {
+ intrusive_ptr<const Value> pItem(pIter->next());
+ BSONType iType = pItem->getType();
+ if ((iType == Undefined) || (iType == jstNULL)) {
+ result.push_back(pItem);
+ continue;
+ }
+
+ uassert(16014, str::stream() <<
+ "the element \"" << fieldPath.getFieldName(index) <<
+ "\" along the dotted path \"" <<
+ fieldPath.getPath(false) <<
+ "\" is not an object, and cannot be navigated",
+ iType == Object);
+ intrusive_ptr<const Value> itemResult(
+ evaluatePath(index, pathLength, pItem->getDocument()));
+ result.push_back(itemResult);
+ }
+
+ return Value::createArray(result);
+ }
+
+ uassert(16015, str::stream() <<
+ "can't navigate into value of type " << type <<
+ " at \"" << fieldPath.getFieldName(index) <<
+ "\" in dotted path \"" << fieldPath.getPath(false) << "\"",
+ false);
+ return intrusive_ptr<const Value>();
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldPath::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return evaluatePath(0, fieldPath.getPathLength(), pDocument);
+ }
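+
+ /*
+ A sketch of the traversal above (hypothetical document): the path
+ "a.b" applied to
+
+ { a : [ { b : 1 }, { b : 2 }, null ] }
+
+ maps the rest of the path over each array element, yielding
+ [ 1, 2, null ]. An element missing "b" would contribute
+ undefined, and a non-object, non-null element would trip
+ uassert 16014.
+ */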
+
+ void ExpressionFieldPath::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ pBuilder->append(fieldName, fieldPath.getPath(true));
+ }
+
+ void ExpressionFieldPath::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ pBuilder->append(getFieldPath(true));
+ }
+
+ /* --------------------- ExpressionFieldRange -------------------------- */
+
+ ExpressionFieldRange::~ExpressionFieldRange() {
+ }
+
+ intrusive_ptr<Expression> ExpressionFieldRange::optimize() {
+ /* if there is no range to match, this will never evaluate true */
+ if (!pRange.get())
+ return ExpressionConstant::create(Value::getFalse());
+
+ /*
+ If we ended up with a range that is unbounded at both ends,
+ anything matches. I don't know how that can happen, given
+ intersect()'s interface, but here it is, just in case.
+ */
+ if (!pRange->pBottom.get() && !pRange->pTop.get())
+ return ExpressionConstant::create(Value::getTrue());
+
+ /*
+ In all other cases, we have to test candidate values. The
+ intersect() method has already optimized those tests, so there
+ aren't any more optimizations to look for here.
+ */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldRange::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ /* if there's no range, there can't be a match */
+ if (!pRange.get())
+ return Value::getFalse();
+
+ /* get the value of the specified field */
+ intrusive_ptr<const Value> pValue(pFieldPath->evaluate(pDocument));
+
+ /* see if it fits within any of the ranges */
+ if (pRange->contains(pValue))
+ return Value::getTrue();
+
+ return Value::getFalse();
+ }
+
+ void ExpressionFieldRange::addToBson(
+ Builder *pBuilder, unsigned depth) const {
+ if (!pRange.get()) {
+ /* nothing will satisfy this predicate */
+ pBuilder->append(false);
+ return;
+ }
+
+ if (!pRange->pTop.get() && !pRange->pBottom.get()) {
+ /* any value will satisfy this predicate */
+ pBuilder->append(true);
+ return;
+ }
+
+ if (pRange->pTop.get() == pRange->pBottom.get()) {
+ BSONArrayBuilder operands;
+ pFieldPath->addToBsonArray(&operands, depth);
+ pRange->pTop->addToBsonArray(&operands);
+
+ BSONObjBuilder equals;
+ equals.append("$eq", operands.arr());
+ pBuilder->append(&equals);
+ return;
+ }
+
+ BSONObjBuilder leftOperator;
+ if (pRange->pBottom.get()) {
+ BSONArrayBuilder leftOperands;
+ pFieldPath->addToBsonArray(&leftOperands, depth);
+ pRange->pBottom->addToBsonArray(&leftOperands);
+ leftOperator.append(
+ (pRange->bottomOpen ? "$gt" : "$gte"),
+ leftOperands.arr());
+
+ if (!pRange->pTop.get()) {
+ pBuilder->append(&leftOperator);
+ return;
+ }
+ }
+
+ BSONObjBuilder rightOperator;
+ if (pRange->pTop.get()) {
+ BSONArrayBuilder rightOperands;
+ pFieldPath->addToBsonArray(&rightOperands, depth);
+ pRange->pTop->addToBsonArray(&rightOperands);
+ rightOperator.append(
+ (pRange->topOpen ? "$lt" : "$lte"),
+ rightOperands.arr());
+
+ if (!pRange->pBottom.get()) {
+ pBuilder->append(&rightOperator);
+ return;
+ }
+ }
+
+ BSONArrayBuilder andOperands;
+ andOperands.append(leftOperator.done());
+ andOperands.append(rightOperator.done());
+ BSONObjBuilder andOperator;
+ andOperator.append("$and", andOperands.arr());
+ pBuilder->append(&andOperator);
+ }
+
+ void ExpressionFieldRange::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ BuilderObj builder(pBuilder, fieldName);
+ addToBson(&builder, depth);
+ }
+
+ void ExpressionFieldRange::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ BuilderArray builder(pBuilder);
+ addToBson(&builder, depth);
+ }
+
+ void ExpressionFieldRange::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ assert(pRange.get()); // otherwise, we can't do anything
+
+ /* if there are no endpoints, then every value is accepted */
+ if (!pRange->pBottom.get() && !pRange->pTop.get())
+ return; // nothing to add to the predicate
+
+ /* we're going to need the field path */
+ string fieldPath(pFieldPath->getFieldPath(false));
+
+ BSONObjBuilder range;
+ if (pRange->pBottom.get()) {
+ /* the test for equality doesn't generate a subobject */
+ if (pRange->pBottom.get() == pRange->pTop.get()) {
+ pRange->pBottom->addToBsonObj(pBuilder, fieldPath);
+ return;
+ }
+
+ pRange->pBottom->addToBsonObj(
+ &range, (pRange->bottomOpen ? "$gt" : "$gte"));
+ }
+
+ if (pRange->pTop.get()) {
+ pRange->pTop->addToBsonObj(
+ &range, (pRange->topOpen ? "$lt" : "$lte"));
+ }
+
+ pBuilder->append(fieldPath, range.done());
+ }
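+
+ /*
+ A sketch of the generated matcher (hypothetical field and bounds):
+ the range 3 <= a < 7 becomes { a : { $gte : 3, $lt : 7 } }, while
+ the degenerate equality range a == 5 becomes just { a : 5 }.
+ */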
+
+ intrusive_ptr<ExpressionFieldRange> ExpressionFieldRange::create(
+ const intrusive_ptr<ExpressionFieldPath> &pFieldPath, CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue) {
+ intrusive_ptr<ExpressionFieldRange> pE(
+ new ExpressionFieldRange(pFieldPath, cmpOp, pValue));
+ return pE;
+ }
+
+ ExpressionFieldRange::ExpressionFieldRange(
+ const intrusive_ptr<ExpressionFieldPath> &pTheFieldPath, CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue):
+ pFieldPath(pTheFieldPath),
+ pRange(new Range(cmpOp, pValue)) {
+ }
+
+ void ExpressionFieldRange::intersect(
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue) {
+
+ /* create the new range */
+ scoped_ptr<Range> pNew(new Range(cmpOp, pValue));
+
+ /*
+ Intersect the new range with the existing one. If they overlap,
+ the result is the narrower common range; if they are disjoint,
+ intersect() returns NULL, and this expression can never match
+ (optimize() will then collapse it to the constant false).
+ */
+ pRange.reset(pRange->intersect(pNew.get()));
+ }
+
+ ExpressionFieldRange::Range::Range(
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue):
+ bottomOpen(false),
+ topOpen(false),
+ pBottom(),
+ pTop() {
+ switch(cmpOp) {
+ case NE:
+ bottomOpen = topOpen = true;
+ /* FALLTHROUGH */
+ case EQ:
+ pBottom = pTop = pValue;
+ break;
+
+ case GT:
+ bottomOpen = true;
+ /* FALLTHROUGH */
+ case GTE:
+ topOpen = true;
+ pBottom = pValue;
+ break;
+
+ case LT:
+ topOpen = true;
+ /* FALLTHROUGH */
+ case LTE:
+ bottomOpen = true;
+ pTop = pValue;
+ break;
+
+ case CMP:
+ assert(false); // not allowed
+ break;
+ }
+ }
+
+ ExpressionFieldRange::Range::Range(const Range &rRange):
+ bottomOpen(rRange.bottomOpen),
+ topOpen(rRange.topOpen),
+ pBottom(rRange.pBottom),
+ pTop(rRange.pTop) {
+ }
+
+ ExpressionFieldRange::Range::Range(
+ const intrusive_ptr<const Value> &pTheBottom, bool theBottomOpen,
+ const intrusive_ptr<const Value> &pTheTop, bool theTopOpen):
+ bottomOpen(theBottomOpen),
+ topOpen(theTopOpen),
+ pBottom(pTheBottom),
+ pTop(pTheTop) {
+ }
+
+ ExpressionFieldRange::Range *ExpressionFieldRange::Range::intersect(
+ const Range *pRange) const {
+ /*
+ Find the max of the bottom end of the ranges.
+
+ Start by assuming the maximum is from pRange. Then, if we have
+ values of our own, see if they're greater.
+ */
+ intrusive_ptr<const Value> pMaxBottom(pRange->pBottom);
+ bool maxBottomOpen = pRange->bottomOpen;
+ if (pBottom.get()) {
+ if (!pRange->pBottom.get()) {
+ pMaxBottom = pBottom;
+ maxBottomOpen = bottomOpen;
+ }
+ else {
+ const int cmp = Value::compare(pBottom, pRange->pBottom);
+ if (cmp == 0)
+ maxBottomOpen = bottomOpen || pRange->bottomOpen;
+ else if (cmp > 0) {
+ pMaxBottom = pBottom;
+ maxBottomOpen = bottomOpen;
+ }
+ }
+ }
+
+ /*
+ Find the minimum of the tops of the ranges.
+
+ Start by assuming the minimum is from pRange. Then, if we have
+ values of our own, see if they are less.
+ */
+ intrusive_ptr<const Value> pMinTop(pRange->pTop);
+ bool minTopOpen = pRange->topOpen;
+ if (pTop.get()) {
+ if (!pRange->pTop.get()) {
+ pMinTop = pTop;
+ minTopOpen = topOpen;
+ }
+ else {
+ const int cmp = Value::compare(pTop, pRange->pTop);
+ if (cmp == 0)
+ minTopOpen = topOpen || pRange->topOpen;
+ else if (cmp < 0) {
+ pMinTop = pTop;
+ minTopOpen = topOpen;
+ }
+ }
+ }
+
+ /*
+ If the intersections didn't create a disjoint set, create the
+ new range.
+ */
+ if (Value::compare(pMaxBottom, pMinTop) <= 0)
+ return new Range(pMaxBottom, maxBottomOpen, pMinTop, minTopOpen);
+
+ /* if we got here, the intersection is empty */
+ return NULL;
+ }
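+
+ /*
+ A sketch with hypothetical bounds: intersecting [3, 10) with
+ (5, unbounded) keeps the larger bottom and the smaller top,
+ giving (5, 10); intersecting [3, 5) with [7, 9) finds the max
+ bottom (7) above the min top (5) and returns NULL, which
+ optimize() then folds to the constant false.
+ */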
+
+ bool ExpressionFieldRange::Range::contains(
+ const intrusive_ptr<const Value> &pValue) const {
+ if (pBottom.get()) {
+ const int cmp = Value::compare(pValue, pBottom);
+ if (cmp < 0)
+ return false;
+ if (bottomOpen && (cmp == 0))
+ return false;
+ }
+
+ if (pTop.get()) {
+ const int cmp = Value::compare(pValue, pTop);
+ if (cmp > 0)
+ return false;
+ if (topOpen && (cmp == 0))
+ return false;
+ }
+
+ return true;
+ }
+
+ /* ------------------------- ExpressionMinute ----------------------------- */
+
+ ExpressionMinute::~ExpressionMinute() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMinute::create() {
+ intrusive_ptr<ExpressionMinute> pExpression(new ExpressionMinute());
+ return pExpression;
+ }
+
+ ExpressionMinute::ExpressionMinute():
+ ExpressionNary() {
+ }
+
+ void ExpressionMinute::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMinute::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_min);
+ }
+
+ const char *ExpressionMinute::getOpName() const {
+ return "$minute";
+ }
+
+ /* ----------------------- ExpressionMod ---------------------------- */
+
+ ExpressionMod::~ExpressionMod() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMod::create() {
+ intrusive_ptr<ExpressionMod> pExpression(new ExpressionMod());
+ return pExpression;
+ }
+
+ ExpressionMod::ExpressionMod():
+ ExpressionNary() {
+ }
+
+ void ExpressionMod::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMod::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ BSONType productType;
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ productType = Value::getWidestNumeric(pRight->getType(), pLeft->getType());
+
+ long long right = pRight->coerceToLong();
+ if (right == 0)
+ return Value::getUndefined();
+
+ long long left = pLeft->coerceToLong();
+ if (productType == NumberLong)
+ return Value::createLong(left % right);
+ return Value::createInt((int)(left % right));
+ }
+
+ const char *ExpressionMod::getOpName() const {
+ return "$mod";
+ }
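+
+ /*
+ Note that, unlike $add and $multiply, $mod does its arithmetic
+ entirely in long long (coerceToLong() discards any fractional
+ part), and the widest operand type only decides between a
+ NumberLong and a NumberInt result; a zero divisor yields
+ undefined, mirroring $divide. E.g. (hypothetical operands)
+ { $mod : [ 7, 3 ] } -> 1.
+ */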
+
+ /* ------------------------- ExpressionMonth ----------------------------- */
+
+ ExpressionMonth::~ExpressionMonth() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMonth::create() {
+ intrusive_ptr<ExpressionMonth> pExpression(new ExpressionMonth());
+ return pExpression;
+ }
+
+ ExpressionMonth::ExpressionMonth():
+ ExpressionNary() {
+ }
+
+ void ExpressionMonth::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMonth::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_mon+1); // MySQL uses 1-12 tm uses 0-11
+ }
+
+ const char *ExpressionMonth::getOpName() const {
+ return "$month";
+ }
+
+ /* ------------------------- ExpressionMultiply ----------------------------- */
+
+ ExpressionMultiply::~ExpressionMultiply() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMultiply::create() {
+ intrusive_ptr<ExpressionMultiply> pExpression(new ExpressionMultiply());
+ return pExpression;
+ }
+
+ ExpressionMultiply::ExpressionMultiply():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<const Value> ExpressionMultiply::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ /*
+ We'll try to return the narrowest possible result value. To do that
+ without creating intermediate Values, do the arithmetic for double
+ and integral types in parallel, tracking the widest numeric type
+ seen so far; the result is built from the matching running total.
+ */
+ double doubleProduct = 1;
+ long long longProduct = 1;
+ BSONType productType = NumberInt;
+
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+
+ productType = Value::getWidestNumeric(productType, pValue->getType());
+ doubleProduct *= pValue->coerceToDouble();
+ longProduct *= pValue->coerceToLong();
+ }
+
+ if (productType == NumberDouble)
+ return Value::createDouble(doubleProduct);
+ if (productType == NumberLong)
+ return Value::createLong(longProduct);
+ return Value::createInt((int)longProduct);
+ }
+
+ const char *ExpressionMultiply::getOpName() const {
+ return "$multiply";
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionMultiply::getFactory() const)() {
+ return ExpressionMultiply::create;
+ }
+
+ /* ------------------------- ExpressionHour ----------------------------- */
+
+ ExpressionHour::~ExpressionHour() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionHour::create() {
+ intrusive_ptr<ExpressionHour> pExpression(new ExpressionHour());
+ return pExpression;
+ }
+
+ ExpressionHour::ExpressionHour():
+ ExpressionNary() {
+ }
+
+ void ExpressionHour::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionHour::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_hour);
+ }
+
+ const char *ExpressionHour::getOpName() const {
+ return "$hour";
+ }
+
+ /* ----------------------- ExpressionIfNull ---------------------------- */
+
+ ExpressionIfNull::~ExpressionIfNull() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionIfNull::create() {
+ intrusive_ptr<ExpressionIfNull> pExpression(new ExpressionIfNull());
+ return pExpression;
+ }
+
+ ExpressionIfNull::ExpressionIfNull():
+ ExpressionNary() {
+ }
+
+ void ExpressionIfNull::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionIfNull::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ BSONType leftType = pLeft->getType();
+
+ if ((leftType != Undefined) && (leftType != jstNULL))
+ return pLeft;
+
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+ return pRight;
+ }
+
+ const char *ExpressionIfNull::getOpName() const {
+ return "$ifNull";
+ }
+
+ /* ------------------------ ExpressionNary ----------------------------- */
+
+ ExpressionNary::ExpressionNary():
+ vpOperand() {
+ }
+
+ intrusive_ptr<Expression> ExpressionNary::optimize() {
+ unsigned constCount = 0; // count of constant operands
+ unsigned stringCount = 0; // count of constant string operands
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pNew(vpOperand[i]->optimize());
+
+ /* substitute the optimized expression */
+ vpOperand[i] = pNew;
+
+ /* check to see if the result was a constant */
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pNew.get());
+ if (pConst) {
+ ++constCount;
+ if (pConst->getValue()->getType() == String)
+ ++stringCount;
+ }
+ }
+
+ /*
+ If all the operands are constant, we can replace this expression
+ with a constant. We can find the value by evaluating this
+ expression over a NULL Document because evaluating the
+ ExpressionConstant never refers to the argument Document.
+ */
+ if (constCount == n) {
+ intrusive_ptr<const Value> pResult(
+ evaluate(intrusive_ptr<Document>()));
+ intrusive_ptr<Expression> pReplacement(
+ ExpressionConstant::create(pResult));
+ return pReplacement;
+ }
+
+ /*
+ If there are any strings, we can't re-arrange anything, so stop
+ now.
+
+ LATER: we could concatenate adjacent strings as a special case.
+ */
+ if (stringCount)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ If there's no more than one constant, then we can't do any
+ constant folding, so don't bother going any further.
+ */
+ if (constCount <= 1)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ If the operator isn't commutative or associative, there's nothing
+ more we can do. We test that by seeing if we can get a factory;
+ if we can, we can use it to construct a temporary expression which
+ we'll evaluate to collapse as many constants as we can down to
+ a single one.
+ */
+ intrusive_ptr<ExpressionNary> (*const pFactory)() = getFactory();
+ if (!pFactory)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ Create a new Expression that will be the replacement for this one.
+ We actually create two: one to hold constant expressions, and
+ one to hold non-constants. Once we've got these, we evaluate
+ the constant expression to produce a single value, as above.
+ We then add this operand to the end of the non-constant expression,
+ and return that.
+ */
+ intrusive_ptr<ExpressionNary> pNew((*pFactory)());
+ intrusive_ptr<ExpressionNary> pConst((*pFactory)());
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pE(vpOperand[i]);
+ if (dynamic_cast<ExpressionConstant *>(pE.get()))
+ pConst->addOperand(pE);
+ else {
+ /*
+ If the child operand is the same type as this, then we can
+ extract its operands and inline them here, since having a
+ factory already tells us the operator is commutative and
+ associative. We can detect sameness of the child operator by
+ checking for equality of the factory.
+
+ Note we don't have to do this recursively, because we
+ called optimize() on all the children first thing in
+ this call to optimize().
+ */
+ ExpressionNary *pNary =
+ dynamic_cast<ExpressionNary *>(pE.get());
+ if (!pNary)
+ pNew->addOperand(pE);
+ else {
+ intrusive_ptr<ExpressionNary> (*const pChildFactory)() =
+ pNary->getFactory();
+ if (pChildFactory != pFactory)
+ pNew->addOperand(pE);
+ else {
+ /* same factory, so flatten */
+ size_t nChild = pNary->vpOperand.size();
+ for(size_t iChild = 0; iChild < nChild; ++iChild) {
+ intrusive_ptr<Expression> pCE(
+ pNary->vpOperand[iChild]);
+ if (dynamic_cast<ExpressionConstant *>(pCE.get()))
+ pConst->addOperand(pCE);
+ else
+ pNew->addOperand(pCE);
+ }
+ }
+ }
+ }
+ }
+
+ /*
+ If there was only one constant, add it to the end of the expression
+ operand vector.
+ */
+ if (pConst->vpOperand.size() == 1)
+ pNew->addOperand(pConst->vpOperand[0]);
+ else if (pConst->vpOperand.size() > 1) {
+ /*
+ If there was more than one constant, collapse all the constants
+ together before adding the result to the end of the expression
+ operand vector.
+ */
+ intrusive_ptr<const Value> pResult(
+ pConst->evaluate(intrusive_ptr<Document>()));
+ pNew->addOperand(ExpressionConstant::create(pResult));
+ }
+
+ return pNew;
+ }
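+
+ /*
+ An end-to-end sketch of the folding above (hypothetical operands):
+
+ { $add : [ 1, "$a", { $add : [ 2, "$b" ] }, 3 ] }
+
+ The nested $add shares this factory, so it is flattened, and the
+ constants 1, 2, and 3 are collapsed, yielding
+ { $add : [ "$a", "$b", 6 ] } -- with the single folded constant
+ at the end, which is the placement the $and and $or
+ optimizations rely on.
+ */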
+
+ void ExpressionNary::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ vpOperand.push_back(pExpression);
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionNary::getFactory() const)() {
+ return NULL;
+ }
+
+ void ExpressionNary::toBson(
+ BSONObjBuilder *pBuilder, const char *pOpName, unsigned depth) const {
+ const size_t nOperand = vpOperand.size();
+ assert(nOperand > 0);
+ if (nOperand == 1) {
+ vpOperand[0]->addToBsonObj(pBuilder, pOpName, depth + 1);
+ return;
+ }
+
+ /* build up the array */
+ BSONArrayBuilder arrBuilder;
+ for(size_t i = 0; i < nOperand; ++i)
+ vpOperand[i]->addToBsonArray(&arrBuilder, depth + 1);
+
+ pBuilder->append(pOpName, arrBuilder.arr());
+ }
+
+ void ExpressionNary::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ BSONObjBuilder exprBuilder;
+ toBson(&exprBuilder, getOpName(), depth);
+ pBuilder->append(fieldName, exprBuilder.done());
+ }
+
+ void ExpressionNary::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ BSONObjBuilder exprBuilder;
+ toBson(&exprBuilder, getOpName(), depth);
+ pBuilder->append(exprBuilder.done());
+ }
+
+ void ExpressionNary::checkArgLimit(unsigned maxArgs) const {
+ uassert(15993, str::stream() << getOpName() <<
+ " only takes " << maxArgs <<
+ " operand" << (maxArgs == 1 ? "" : "s"),
+ vpOperand.size() < maxArgs);
+ }
+
+ void ExpressionNary::checkArgCount(unsigned reqArgs) const {
+ uassert(15997, str::stream() << getOpName() <<
+ ": insufficient operands; " << reqArgs <<
+ " required, only got " << vpOperand.size(),
+ vpOperand.size() == reqArgs);
+ }
+
+ /* ----------------------- ExpressionNoOp ------------------------------ */
+
+ ExpressionNoOp::~ExpressionNoOp() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionNoOp::create() {
+ intrusive_ptr<ExpressionNoOp> pExpression(new ExpressionNoOp());
+ return pExpression;
+ }
+
+ intrusive_ptr<Expression> ExpressionNoOp::optimize() {
+ checkArgCount(1);
+ intrusive_ptr<Expression> pR(vpOperand[0]->optimize());
+ return pR;
+ }
+
+ ExpressionNoOp::ExpressionNoOp():
+ ExpressionNary() {
+ }
+
+ void ExpressionNoOp::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionNoOp::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pValue(vpOperand[0]->evaluate(pDocument));
+ return pValue;
+ }
+
+ const char *ExpressionNoOp::getOpName() const {
+ return "$noOp";
+ }
+
+ /* ------------------------- ExpressionNot ----------------------------- */
+
+ ExpressionNot::~ExpressionNot() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionNot::create() {
+ intrusive_ptr<ExpressionNot> pExpression(new ExpressionNot());
+ return pExpression;
+ }
+
+ ExpressionNot::ExpressionNot():
+ ExpressionNary() {
+ }
+
+ void ExpressionNot::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionNot::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pOp(vpOperand[0]->evaluate(pDocument));
+
+ bool b = pOp->coerceToBool();
+ if (b)
+ return Value::getFalse();
+ return Value::getTrue();
+ }
+
+ const char *ExpressionNot::getOpName() const {
+ return "$not";
+ }
+
+ /* -------------------------- ExpressionOr ----------------------------- */
+
+ ExpressionOr::~ExpressionOr() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionOr::create() {
+ intrusive_ptr<ExpressionNary> pExpression(new ExpressionOr());
+ return pExpression;
+ }
+
+ ExpressionOr::ExpressionOr():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<const Value> ExpressionOr::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+ if (pValue->coerceToBool())
+ return Value::getTrue();
+ }
+
+ return Value::getFalse();
+ }
+
+ void ExpressionOr::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ BSONObjBuilder opArray;
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i)
+ vpOperand[i]->toMatcherBson(&opArray, depth + 1);
+
+ pBuilder->append("$or", opArray.done());
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionOr::getFactory() const)() {
+ return ExpressionOr::create;
+ }
+
+ intrusive_ptr<Expression> ExpressionOr::optimize() {
+ /* optimize the disjunction as much as possible */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /* if the result isn't a disjunction, we can't do anything */
+ ExpressionOr *pOr = dynamic_cast<ExpressionOr *>(pE.get());
+ if (!pOr)
+ return pE;
+
+ /*
+ Check the last argument on the result; if it's not constant (as
+ promised by ExpressionNary::optimize()), then there's nothing
+ we can do.
+ */
+ const size_t n = pOr->vpOperand.size();
+ intrusive_ptr<Expression> pLast(pOr->vpOperand[n - 1]);
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pLast.get());
+ if (!pConst)
+ return pE;
+
+ /*
+ Evaluate and coerce the last argument to a boolean. If it's true,
+ then we can replace this entire expression.
+ */
+ bool last = pLast->evaluate(intrusive_ptr<Document>())->coerceToBool();
+ if (last) {
+ intrusive_ptr<ExpressionConstant> pFinal(
+ ExpressionConstant::create(Value::getTrue()));
+ return pFinal;
+ }
+
+ /*
+ If we got here, the final operand was false, so we don't need it
+ anymore. If there was only one other operand, we don't need the
+ disjunction either. Note we still need to keep the promise that
+ the result will be a boolean.
+ */
+ if (n == 2) {
+ intrusive_ptr<Expression> pFinal(
+ ExpressionCoerceToBool::create(pOr->vpOperand[0]));
+ return pFinal;
+ }
+
+ /*
+ Remove the final "false" value, and return the new expression.
+ */
+ pOr->vpOperand.resize(n - 1);
+ return pE;
+ }
+
+ const char *ExpressionOr::getOpName() const {
+ return "$or";
+ }
+
+ /* ------------------------- ExpressionSecond ----------------------------- */
+
+ ExpressionSecond::~ExpressionSecond() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSecond::create() {
+ intrusive_ptr<ExpressionSecond> pExpression(new ExpressionSecond());
+ return pExpression;
+ }
+
+ ExpressionSecond::ExpressionSecond():
+ ExpressionNary() {
+ }
+
+ void ExpressionSecond::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSecond::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_sec);
+ }
+
+ const char *ExpressionSecond::getOpName() const {
+ return "$second";
+ }
+
+ /* ----------------------- ExpressionStrcasecmp ---------------------------- */
+
+ ExpressionStrcasecmp::~ExpressionStrcasecmp() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionStrcasecmp::create() {
+ intrusive_ptr<ExpressionStrcasecmp> pExpression(new ExpressionStrcasecmp());
+ return pExpression;
+ }
+
+ ExpressionStrcasecmp::ExpressionStrcasecmp():
+ ExpressionNary() {
+ }
+
+ void ExpressionStrcasecmp::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionStrcasecmp::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pString1(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pString2(vpOperand[1]->evaluate(pDocument));
+
+        /* boost::iequals() only reports equality (a bool, not an ordering),
+           so upper-case copies of both strings must actually be allocated
+           and compared */
+ string str1 = boost::to_upper_copy( pString1->coerceToString() );
+ string str2 = boost::to_upper_copy( pString2->coerceToString() );
+ int result = str1.compare(str2);
+
+ if (result == 0)
+ return Value::getZero();
+ if (result > 0)
+ return Value::getOne();
+ return Value::getMinusOne();
+ }
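+
+    /*
+      Illustrative example (not part of the original source): in a
+      pipeline, { $strcasecmp: [ "$name", "MongoDB" ] } yields 0 when
+      $name matches "MongoDB" in any case, 1 when its upper-cased form
+      sorts after "MONGODB", and -1 when it sorts before it.
+    */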
+
+ const char *ExpressionStrcasecmp::getOpName() const {
+ return "$strcasecmp";
+ }
+
+ /* ----------------------- ExpressionSubstr ---------------------------- */
+
+ ExpressionSubstr::~ExpressionSubstr() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSubstr::create() {
+ intrusive_ptr<ExpressionSubstr> pExpression(new ExpressionSubstr());
+ return pExpression;
+ }
+
+ ExpressionSubstr::ExpressionSubstr():
+ ExpressionNary() {
+ }
+
+ void ExpressionSubstr::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(3);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSubstr::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(3);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pLower(vpOperand[1]->evaluate(pDocument));
+ intrusive_ptr<const Value> pLength(vpOperand[2]->evaluate(pDocument));
+
+ string str = pString->coerceToString();
+ uassert(16034, str::stream() << getOpName() <<
+ ": starting index must be a numeric type (is BSON type " <<
+ pLower->getType() << ")",
+ (pLower->getType() == NumberInt
+ || pLower->getType() == NumberLong
+ || pLower->getType() == NumberDouble));
+ uassert(16035, str::stream() << getOpName() <<
+ ": length must be a numeric type (is BSON type " <<
+ pLength->getType() << ")",
+ (pLength->getType() == NumberInt
+ || pLength->getType() == NumberLong
+ || pLength->getType() == NumberDouble));
+ string::size_type lower = static_cast< string::size_type >( pLower->coerceToLong() );
+ string::size_type length = static_cast< string::size_type >( pLength->coerceToLong() );
+ return Value::createString( str.substr(lower, length) );
+ }
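+
+    /*
+      Illustrative example (not part of the original source):
+      { $substr: [ "$title", 0, 4 ] } evaluates to the first four bytes
+      of $title; the starting index and length are byte-based, as with
+      std::string::substr() above.
+    */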
+
+ const char *ExpressionSubstr::getOpName() const {
+ return "$substr";
+ }
+
+ /* ----------------------- ExpressionSubtract ---------------------------- */
+
+ ExpressionSubtract::~ExpressionSubtract() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSubtract::create() {
+ intrusive_ptr<ExpressionSubtract> pExpression(new ExpressionSubtract());
+ return pExpression;
+ }
+
+ ExpressionSubtract::ExpressionSubtract():
+ ExpressionNary() {
+ }
+
+ void ExpressionSubtract::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSubtract::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ BSONType productType;
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+ if (pLeft->getType() == Date) {
+ long long right;
+ long long left = pLeft->coerceToDate();
+ if (pRight->getType() == Date)
+ right = pRight->coerceToDate();
+            else
+                /* a non-Date right operand is interpreted as a number of days */
+                right = static_cast<long long>(pRight->coerceToDouble()*24*60*60*1000);
+ return Value::createDate(Date_t(left-right));
+ }
+
+        uassert(15996, "cannot subtract a date from a non-date",
+ pRight->getType() != Date);
+
+ productType = Value::getWidestNumeric(
+ pRight->getType(), pLeft->getType());
+
+ if (productType == NumberDouble) {
+ double right = pRight->coerceToDouble();
+ double left = pLeft->coerceToDouble();
+ return Value::createDouble(left - right);
+ }
+
+ long long right = pRight->coerceToLong();
+ long long left = pLeft->coerceToLong();
+ if (productType == NumberLong)
+ return Value::createLong(left - right);
+ return Value::createInt((int)(left - right));
+ }
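+
+    /*
+      Illustrative example (not part of the original source): with a
+      Date-valued $ts, { $subtract: [ "$ts", 1 ] } shifts $ts back by
+      one day, because a non-Date right operand is interpreted as a
+      count of days; { $subtract: [ "$ts", "$ts0" ] } yields a Date
+      holding the millisecond difference of the two dates.
+    */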
+
+ const char *ExpressionSubtract::getOpName() const {
+ return "$subtract";
+ }
+
+ /* ------------------------- ExpressionToLower ----------------------------- */
+
+ ExpressionToLower::~ExpressionToLower() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionToLower::create() {
+ intrusive_ptr<ExpressionToLower> pExpression(new ExpressionToLower());
+ return pExpression;
+ }
+
+ ExpressionToLower::ExpressionToLower():
+ ExpressionNary() {
+ }
+
+ void ExpressionToLower::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionToLower::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ string str = pString->coerceToString();
+ boost::to_lower(str);
+ return Value::createString(str);
+ }
+
+ const char *ExpressionToLower::getOpName() const {
+ return "$toLower";
+ }
+
+ /* ------------------------- ExpressionToUpper -------------------------- */
+
+ ExpressionToUpper::~ExpressionToUpper() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionToUpper::create() {
+ intrusive_ptr<ExpressionToUpper> pExpression(new ExpressionToUpper());
+ return pExpression;
+ }
+
+ ExpressionToUpper::ExpressionToUpper():
+ ExpressionNary() {
+ }
+
+ void ExpressionToUpper::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionToUpper::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ string str(pString->coerceToString());
+ boost::to_upper(str);
+ return Value::createString(str);
+ }
+
+ const char *ExpressionToUpper::getOpName() const {
+ return "$toUpper";
+ }
+
+ /* ------------------------- ExpressionWeek ----------------------------- */
+
+ ExpressionWeek::~ExpressionWeek() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionWeek::create() {
+ intrusive_ptr<ExpressionWeek> pExpression(new ExpressionWeek());
+ return pExpression;
+ }
+
+ ExpressionWeek::ExpressionWeek():
+ ExpressionNary() {
+ }
+
+ void ExpressionWeek::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionWeek::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ int dayOfWeek = date.tm_wday+1;
+ int dayOfYear = date.tm_yday;
+ int week = 0;
+ int janFirst = 0;
+ int offset = 0;
+
+        /* day of the week (mod 7) on which January 1 fell */
+        janFirst = dayOfWeek - dayOfYear % 7;
+        /* offset that aligns week boundaries to Sundays */
+        offset = (janFirst + 6) % 7;
+        /* zero-based week of the year, with weeks starting on Sunday */
+        week = (dayOfYear + offset) / 7;
+ return Value::createInt(week);
+ }
+
+ const char *ExpressionWeek::getOpName() const {
+ return "$week";
+ }
+
+ /* ------------------------- ExpressionYear ----------------------------- */
+
+ ExpressionYear::~ExpressionYear() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionYear::create() {
+ intrusive_ptr<ExpressionYear> pExpression(new ExpressionYear());
+ return pExpression;
+ }
+
+ ExpressionYear::ExpressionYear():
+ ExpressionNary() {
+ }
+
+ void ExpressionYear::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionYear::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_year+1900); // tm_year is years since 1900
+ }
+
+ const char *ExpressionYear::getOpName() const {
+ return "$year";
+ }
+}
diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h
new file mode 100755
index 00000000000..c49e385a3c7
--- /dev/null
+++ b/src/mongo/db/pipeline/expression.h
@@ -0,0 +1,1223 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "db/pipeline/field_path.h"
+#include "util/intrusive_counter.h"
+
+
+namespace mongo {
+ class BSONArrayBuilder;
+ class BSONElement;
+ class BSONObjBuilder;
+ class Builder;
+ class Document;
+ class ExpressionContext;
+ class Value;
+
+ class Expression :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~Expression() {};
+
+ /*
+ Optimize the Expression.
+
+ This provides an opportunity to do constant folding, or to
+ collapse nested operators that have the same precedence, such as
+ $add, $and, or $or.
+
+ The Expression should be replaced with the return value, which may
+ or may not be the same object. In the case of constant folding,
+ a computed expression may be replaced by a constant.
+
+ @returns the optimized Expression
+ */
+ virtual intrusive_ptr<Expression> optimize() = 0;
+
+ /*
+ Evaluate the Expression using the given document as input.
+
+ @returns the computed value
+ */
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const = 0;
+
+ /*
+ Add the Expression (and any descendant Expressions) into a BSON
+ object that is under construction.
+
+ Unevaluated Expressions always materialize as objects. Evaluation
+ may produce a scalar or another object, either of which will be
+ substituted inline.
+
+ @param pBuilder the builder to add the expression to
+ @param fieldName the name the object should be given
+ */
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName,
+ unsigned depth) const = 0;
+
+ /*
+ Add the Expression (and any descendant Expressions) into a BSON
+ array that is under construction.
+
+ Unevaluated Expressions always materialize as objects. Evaluation
+ may produce a scalar or another object, either of which will be
+ substituted inline.
+
+ @param pBuilder the builder to add the expression to
+ */
+ virtual void addToBsonArray(BSONArrayBuilder *pBuilder,
+ unsigned depth) const = 0;
+
+ /*
+ Convert the expression into a BSONObj that corresponds to the
+ db.collection.find() predicate language. This is intended for
+ use by DocumentSourceFilter.
+
+ This is more limited than the full expression language supported
+ by all available expressions in a DocumentSource processing
+ pipeline, and will fail with an assertion if an attempt is made
+ to go outside the bounds of the recognized patterns, which don't
+ include full computed expressions. There are other methods available
+ on DocumentSourceFilter which can be used to analyze a filter
+ predicate and break it up into appropriate expressions which can
+ be translated within these constraints. As a result, the default
+ implementation is to fail with an assertion; only a subset of
+ operators will be able to fulfill this request.
+
+ @param pBuilder the builder to add the expression to.
+ */
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Utility class for parseObject() below.
+
+ Only one array can be unwound in a processing pipeline. If the
+ UNWIND_OK option is used, unwindOk() will return true, and a field
+ can be declared as unwound using unwind(), after which unwindUsed()
+ will return true. Only specify UNWIND_OK if it is OK to unwind an
+ array in the current context.
+
+ DOCUMENT_OK indicates that it is OK to use a Document in the current
+ context.
+ */
+ class ObjectCtx {
+ public:
+ ObjectCtx(int options);
+ static const int UNWIND_OK = 0x0001;
+ static const int DOCUMENT_OK = 0x0002;
+
+ bool unwindOk() const;
+ bool unwindUsed() const;
+ void unwind(string fieldName);
+
+ bool documentOk() const;
+
+ private:
+ int options;
+ string unwindField;
+ };
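+
+        /*
+          Illustrative sketch (not part of the original source):
+
+              Expression::ObjectCtx ctx(
+                  Expression::ObjectCtx::UNWIND_OK |
+                  Expression::ObjectCtx::DOCUMENT_OK);
+              // ctx.unwindOk() is true; ctx.unwindUsed() remains false
+              // until ctx.unwind("someField") is called
+        */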
+
+ /*
+ Parse a BSONElement Object. The object could represent a functional
+ expression or a Document expression.
+
+ @param pBsonElement the element representing the object
+          @param pCtx an ObjectCtx representing the options above
+ @returns the parsed Expression
+ */
+ static intrusive_ptr<Expression> parseObject(
+ BSONElement *pBsonElement, ObjectCtx *pCtx);
+
+ static const char unwindName[];
+
+ /*
+ Parse a BSONElement Object which has already been determined to be
+          a functional expression.
+
+ @param pOpName the name of the (prefix) operator
+ @param pBsonElement the BSONElement to parse
+ @returns the parsed Expression
+ */
+ static intrusive_ptr<Expression> parseExpression(
+ const char *pOpName, BSONElement *pBsonElement);
+
+
+ /*
+ Parse a BSONElement which is an operand in an Expression.
+
+ @param pBsonElement the expected operand's BSONElement
+ @returns the parsed operand, as an Expression
+ */
+ static intrusive_ptr<Expression> parseOperand(
+ BSONElement *pBsonElement);
+
+ /*
+ Produce a field path string with the field prefix removed.
+
+ Throws an error if the field prefix is not present.
+
+ @param prefixedField the prefixed field
+ @returns the field path with the prefix removed
+ */
+ static string removeFieldPrefix(const string &prefixedField);
+
+ /*
+ Enumeration of comparison operators. These are shared between a
+ few expression implementations, so they are factored out here.
+
+ Any changes to these values require adjustment of the lookup
+ table in the implementation.
+ */
+ enum CmpOp {
+ EQ = 0, // return true for a == b, false otherwise
+ NE = 1, // return true for a != b, false otherwise
+ GT = 2, // return true for a > b, false otherwise
+ GTE = 3, // return true for a >= b, false otherwise
+ LT = 4, // return true for a < b, false otherwise
+ LTE = 5, // return true for a <= b, false otherwise
+ CMP = 6, // return -1, 0, 1 for a < b, a == b, a > b
+ };
+
+ static int signum(int i);
+ };
+
+
+ class ExpressionNary :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionNary> {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<Expression> optimize();
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Add an operand to the n-ary expression.
+
+ @param pExpression the expression to add
+ */
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Return a factory function that will make Expression nodes of
+ the same type as this. This will be used to create constant
+ expressions for constant folding for optimize(). Only return
+ a factory function if this operator is both associative and
+ commutative. The default implementation returns NULL; optimize()
+ will recognize that and stop.
+
+ Note that ExpressionNary::optimize() promises that if it uses this
+ to fold constants, then if optimize() returns an ExpressionNary,
+ any remaining constant will be the last one in vpOperand. Derived
+ classes may take advantage of this to do further optimizations in
+ their optimize().
+
+ @returns pointer to a factory function or NULL
+ */
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Get the name of the operator.
+
+          @returns the name of the operator; this string belongs to the
+          class implementation, and should not be deleted by the caller
+ */
+ virtual const char *getOpName() const = 0;
+
+ protected:
+ ExpressionNary();
+
+ vector<intrusive_ptr<Expression> > vpOperand;
+
+ /*
+ Add the expression to the builder.
+
+ If there is only one operand (a unary operator), then the operand
+ is added directly, without an array. For more than one operand,
+ a named array is created. In both cases, the result is an object.
+
+ @param pBuilder the (blank) builder to add the expression to
+ @param pOpName the name of the operator
+ */
+ virtual void toBson(BSONObjBuilder *pBuilder,
+ const char *pOpName, unsigned depth) const;
+
+ /*
+ Checks the current size of vpOperand; if the size equal to or
+ greater than maxArgs, fires a user assertion indicating that this
+ operator cannot have this many arguments.
+
+          Equality also fires the assertion because this is intended to
+          be used in addOperand() to check the limit *before* adding the
+          requested argument.
+
+ @param maxArgs the maximum number of arguments the operator accepts
+ */
+ void checkArgLimit(unsigned maxArgs) const;
+
+ /*
+ Checks the current size of vpOperand; if the size is not equal to
+ reqArgs, fires a user assertion indicating that this must have
+ exactly reqArgs arguments.
+
+ This is meant to be used in evaluate(), *before* the evaluation
+ takes place.
+
+ @param reqArgs the number of arguments this operator requires
+ */
+ void checkArgCount(unsigned reqArgs) const;
+ };
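+
+    /*
+      Illustrative sketch (not part of the original source): the pattern a
+      fixed-arity operator follows using the helpers above, shown for a
+      hypothetical unary ExpressionExample:
+
+          void ExpressionExample::addOperand(
+              const intrusive_ptr<Expression> &pExpression) {
+              checkArgLimit(1);                  // at most one operand
+              ExpressionNary::addOperand(pExpression);
+          }
+
+          intrusive_ptr<const Value> ExpressionExample::evaluate(
+              const intrusive_ptr<Document> &pDocument) const {
+              checkArgCount(1);                  // exactly one operand
+              return vpOperand[0]->evaluate(pDocument);
+          }
+    */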
+
+
+ class ExpressionAdd :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionAdd();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the sum of n operands.
+
+ @returns addition expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ protected:
+ // virtuals from ExpressionNary
+ virtual void toBson(BSONObjBuilder *pBuilder,
+ const char *pOpName, unsigned depth) const;
+
+ private:
+ ExpressionAdd();
+
+ /*
+ If the operator can be optimized, we save the original here.
+
+ This is necessary because addition must follow its original operand
+ ordering strictly if a string is detected, otherwise string
+ concatenation may appear to have re-ordered the operands.
+ */
+ intrusive_ptr<ExpressionAdd> pAdd;
+ mutable bool useOriginal;
+ };
+
+
+ class ExpressionAnd :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionAnd();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the conjunction of n operands.
+ The conjunction uses short-circuit logic; the expressions are
+ evaluated in the order they were added to the conjunction, and
+ the evaluation stops and returns false on the first operand that
+ evaluates to false.
+
+ @returns conjunction expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionAnd();
+ };
+
+
+ class ExpressionCoerceToBool :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionCoerceToBool> {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCoerceToBool();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ static intrusive_ptr<ExpressionCoerceToBool> create(
+ const intrusive_ptr<Expression> &pExpression);
+
+ private:
+ ExpressionCoerceToBool(const intrusive_ptr<Expression> &pExpression);
+
+ intrusive_ptr<Expression> pExpression;
+ };
+
+
+ class ExpressionCompare :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCompare();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+          Shorthands for creating various comparison expressions.
+          These conform to the uniform function pointer signature
+          required for parsing.
+
+          Each creates a particular comparison operator without any
+          operands; those must be added via ExpressionNary::addOperand().
+ */
+ static intrusive_ptr<ExpressionNary> createCmp();
+ static intrusive_ptr<ExpressionNary> createEq();
+ static intrusive_ptr<ExpressionNary> createNe();
+ static intrusive_ptr<ExpressionNary> createGt();
+ static intrusive_ptr<ExpressionNary> createGte();
+ static intrusive_ptr<ExpressionNary> createLt();
+ static intrusive_ptr<ExpressionNary> createLte();
+
+ private:
+ friend class ExpressionFieldRange;
+ ExpressionCompare(CmpOp cmpOp);
+
+ CmpOp cmpOp;
+ };
+
+
+ class ExpressionCond :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCond();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionCond();
+ };
+
+
+ class ExpressionConstant :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionConstant> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionConstant();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ static intrusive_ptr<ExpressionConstant> createFromBsonElement(
+ BSONElement *pBsonElement);
+ static intrusive_ptr<ExpressionConstant> create(
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Get the constant value represented by this Expression.
+
+ @returns the value
+ */
+ intrusive_ptr<const Value> getValue() const;
+
+ private:
+ ExpressionConstant(BSONElement *pBsonElement);
+ ExpressionConstant(const intrusive_ptr<const Value> &pValue);
+
+ intrusive_ptr<const Value> pValue;
+ };
+
+
+ class ExpressionDayOfMonth :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfMonth();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfMonth();
+ };
+
+
+ class ExpressionDayOfWeek :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfWeek();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfWeek();
+ };
+
+
+ class ExpressionDayOfYear :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfYear();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfYear();
+ };
+
+
+ class ExpressionDivide :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDivide();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDivide();
+ };
+
+
+ class ExpressionFieldPath :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionFieldPath> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionFieldPath();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Create a field path expression.
+
+ Evaluation will extract the value associated with the given field
+ path from the source document.
+
+ @param fieldPath the field path string, without any leading document
+ indicator
+ @returns the newly created field path expression
+ */
+ static intrusive_ptr<ExpressionFieldPath> create(
+ const string &fieldPath);
+
+ /*
+ Return a string representation of the field path.
+
+ @param fieldPrefix whether or not to include the document field
+ indicator prefix
+ @returns the dot-delimited field path
+ */
+ string getFieldPath(bool fieldPrefix) const;
+
+ /*
+ Write a string representation of the field path to a stream.
+
+          @param outStream the stream to write to
+ @param fieldPrefix whether or not to include the document field
+ indicator prefix
+ */
+ void writeFieldPath(ostream &outStream, bool fieldPrefix) const;
+
+ private:
+ ExpressionFieldPath(const string &fieldPath);
+
+ /*
+ Internal implementation of evaluate(), used recursively.
+
+ The internal implementation doesn't just use a loop because of
+ the possibility that we need to skip over an array. If the path
+ is "a.b.c", and a is an array, then we fan out from there, and
+ traverse "b.c" for each element of a:[...]. This requires that
+ a be an array of objects in order to navigate more deeply.
+
+ @param index current path field index to extract
+ @param pathLength maximum number of fields on field path
+ @param pDocument current document traversed to (not the top-level one)
+ @returns the field found; could be an array
+ */
+ intrusive_ptr<const Value> evaluatePath(
+ size_t index, const size_t pathLength,
+ intrusive_ptr<Document> pDocument) const;
+
+ FieldPath fieldPath;
+ };
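+
+    /*
+      Illustrative example (not part of the original source): evaluating
+      the field path "a.b" against { a: [ { b: 1 }, { b: 2 } ] } fans out
+      across the array as described above, yielding the array [ 1, 2 ].
+    */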
+
+
+ class ExpressionFieldRange :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionFieldRange> {
+ public:
+ // virtuals from expression
+ virtual ~ExpressionFieldRange();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Create a field range expression.
+
+ Field ranges are meant to match up with classic Matcher semantics,
+ and therefore are conjunctions. For example, these appear in
+ mongo shell predicates in one of these forms:
+ { a : C } -> (a == C) // degenerate "point" range
+ { a : { $lt : C } } -> (a < C) // open range
+ { a : { $gt : C1, $lte : C2 } } -> ((a > C1) && (a <= C2)) // closed
+
+ When initially created, a field range only includes one end of
+ the range. Additional points may be added via intersect().
+
+ Note that NE and CMP are not supported.
+
+ @param pFieldPath the field path for extracting the field value
+ @param cmpOp the comparison operator
+ @param pValue the value to compare against
+ @returns the newly created field range expression
+ */
+ static intrusive_ptr<ExpressionFieldRange> create(
+ const intrusive_ptr<ExpressionFieldPath> &pFieldPath,
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Add an intersecting range.
+
+ This can be done any number of times after creation. The
+ range is internally optimized for each new addition. If the new
+ intersection extends or reduces the values within the range, the
+ internal representation is adjusted to reflect that.
+
+ Note that NE and CMP are not supported.
+
+ @param cmpOp the comparison operator
+ @param pValue the value to compare against
+ */
+ void intersect(CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+
+ private:
+ ExpressionFieldRange(const intrusive_ptr<ExpressionFieldPath> &pFieldPath,
+ CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue);
+
+ intrusive_ptr<ExpressionFieldPath> pFieldPath;
+
+ class Range {
+ public:
+ Range(CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+ Range(const Range &rRange);
+
+ Range *intersect(const Range *pRange) const;
+ bool contains(const intrusive_ptr<const Value> &pValue) const;
+
+ Range(const intrusive_ptr<const Value> &pBottom, bool bottomOpen,
+ const intrusive_ptr<const Value> &pTop, bool topOpen);
+
+ bool bottomOpen;
+ bool topOpen;
+ intrusive_ptr<const Value> pBottom;
+ intrusive_ptr<const Value> pTop;
+ };
+
+ scoped_ptr<Range> pRange;
+
+ /*
+ Add to a generic Builder.
+
+ The methods to append items to an object and an array differ by
+ their inclusion of a field name. For more complicated objects,
+ it makes sense to abstract that out and use a generic builder that
+ always looks the same, and then implement addToBsonObj() and
+ addToBsonArray() by using the common method.
+ */
+ void addToBson(Builder *pBuilder, unsigned depth) const;
+ };
+
+
+ class ExpressionHour :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionHour();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionHour();
+ };
+
+
+ class ExpressionIfNull :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionIfNull();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionIfNull();
+ };
+
+
+ class ExpressionMinute :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMinute();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMinute();
+ };
+
+
+ class ExpressionMod :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMod();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMod();
+ };
+
+
+ class ExpressionMultiply :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionMultiply();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the product of n operands.
+
+ @returns multiplication expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMultiply();
+ };
+
+
+ class ExpressionMonth :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMonth();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMonth();
+ };
+
+
+ class ExpressionNoOp :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionNoOp();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionNoOp();
+ };
+
+
+ class ExpressionNot :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionNot();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionNot();
+ };
+
+
+ class ExpressionObject :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionObject> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionObject();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ evaluate(), but return a Document instead of a Value-wrapped
+ Document.
+
+ @param pDocument the input Document
+ @returns the result document
+ */
+ intrusive_ptr<Document> evaluateDocument(
+ const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ evaluate(), but add the evaluated fields to a given document
+ instead of creating a new one.
+
+ @param pResult the Document to add the evaluated expressions to
+ @param pDocument the input Document
+ */
+ void addToDocument(const intrusive_ptr<Document> &pResult,
+ const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ Estimate the number of fields that will result from evaluating
+ this over pDocument. Does not include _id. This is an estimate
+ (really an upper bound) because we can't account for undefined
+ fields without actually doing the evaluation. But this is still
+ useful as an argument to Document::create(), if you plan to use
+ addToDocument().
+
+ @param pDocument the input document
+ @returns estimated number of fields that will result
+ */
+ size_t getSizeHint(const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ Create an empty expression. Until fields are added, this
+ will evaluate to an empty document (object).
+ */
+ static intrusive_ptr<ExpressionObject> create();
+
+ /*
+ Add a field to the document expression.
+
+ @param fieldPath the path the evaluated expression will have in the
+ result Document
+          @param pExpression the expression to evaluate to obtain this
+          field's Value in the result Document
+ */
+ void addField(const string &fieldPath,
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Add a field path to the set of those to be included.
+
+ Note that including a nested field implies including everything on
+ the path leading down to it.
+
+ @param fieldPath the name of the field to be included
+ */
+ void includePath(const string &fieldPath);
+
+ /*
+ Add a field path to the set of those to be excluded.
+
+ Note that excluding a nested field implies including everything on
+ the path leading down to it (because you're stating you want to see
+ all the other fields that aren't being excluded).
+
+          @param fieldPath the path of the field to be excluded
+ */
+ void excludePath(const string &fieldPath);
+
+ /*
+ Return the expression for a field.
+
+ @param fieldName the field name for the expression to return
+ @returns the expression used to compute the field, if it is present,
+ otherwise NULL.
+ */
+ intrusive_ptr<Expression> getField(const string &fieldName) const;
+
+ /*
+ Get a count of the added fields.
+
+ @returns how many fields have been added
+ */
+ size_t getFieldCount() const;
+
+ /*
+ Get a count of the exclusions.
+
+ @returns how many fields have been excluded.
+ */
+ size_t getExclusionCount() const;
+
+ /*
+ Specialized BSON conversion that allows for writing out a
+ $project specification. This creates a standalone object, which must
+ be added to a containing object with a name
+
+ @param pBuilder where to write the object to
+ */
+ void documentToBson(BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ private:
+ ExpressionObject();
+
+ void includePath(
+ const FieldPath *pPath, size_t pathi, size_t pathn,
+ bool excludeLast);
+
+ bool excludePaths;
+ set<string> path;
+
+ /* these two vectors are maintained in parallel */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<Expression> > vpExpression;
+
+ /*
+ Utility function used by documentToBson(). Emits inclusion
+ and exclusion paths by recursively walking down the nested
+ ExpressionObject trees these have created.
+
+ @param pBuilder the builder to write boolean valued path "fields" to
+ @param pvPath pointer to a vector of strings describing the path on
+ descent; the top-level call should pass an empty vector
+ */
+ void emitPaths(BSONObjBuilder *pBuilder, vector<string> *pvPath) const;
+
+ /* utility class used by emitPaths() */
+ class PathPusher :
+ boost::noncopyable {
+ public:
+ PathPusher(vector<string> *pvPath, const string &s);
+ ~PathPusher();
+
+ private:
+ vector<string> *pvPath;
+ };
+ };
+
+
+ class ExpressionOr :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionOr();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+          Create an expression that finds the disjunction of n operands.
+          The disjunction uses short-circuit logic; the expressions are
+          evaluated in the order they were added to the disjunction, and
+          the evaluation stops and returns true on the first operand that
+          evaluates to true.
+
+          @returns disjunction expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionOr();
+ };
+
+
+ class ExpressionSecond :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSecond();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSecond();
+ };
+
+
+ class ExpressionStrcasecmp :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionStrcasecmp();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionStrcasecmp();
+ };
+
+
+ class ExpressionSubstr :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSubstr();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSubstr();
+ };
+
+
+ class ExpressionSubtract :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSubtract();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSubtract();
+ };
+
+
+ class ExpressionToLower :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionToLower();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionToLower();
+ };
+
+
+ class ExpressionToUpper :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionToUpper();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionToUpper();
+ };
+
+
+ class ExpressionWeek :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionWeek();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionWeek();
+ };
+
+
+ class ExpressionYear :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionYear();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionYear();
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline bool Expression::ObjectCtx::unwindOk() const {
+ return ((options & UNWIND_OK) != 0);
+ }
+
+ inline bool Expression::ObjectCtx::unwindUsed() const {
+ return (unwindField.size() != 0);
+ }
+
+ inline int Expression::signum(int i) {
+ if (i < 0)
+ return -1;
+ if (i > 0)
+ return 1;
+ return 0;
+ }
+
+ inline intrusive_ptr<const Value> ExpressionConstant::getValue() const {
+ return pValue;
+ }
+
+ inline string ExpressionFieldPath::getFieldPath(bool fieldPrefix) const {
+ return fieldPath.getPath(fieldPrefix);
+ }
+
+ inline void ExpressionFieldPath::writeFieldPath(
+ ostream &outStream, bool fieldPrefix) const {
+ return fieldPath.writePath(outStream, fieldPrefix);
+ }
+
+ inline size_t ExpressionObject::getFieldCount() const {
+ return vFieldName.size();
+ }
+
+ inline ExpressionObject::PathPusher::PathPusher(
+ vector<string> *pTheVPath, const string &s):
+ pvPath(pTheVPath) {
+ pvPath->push_back(s);
+ }
+
+ inline ExpressionObject::PathPusher::~PathPusher() {
+ pvPath->pop_back();
+ }
+
+}
diff --git a/src/mongo/db/pipeline/expression_context.cpp b/src/mongo/db/pipeline/expression_context.cpp
new file mode 100755
index 00000000000..4835dcfa5a9
--- /dev/null
+++ b/src/mongo/db/pipeline/expression_context.cpp
@@ -0,0 +1,35 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/expression_context.h"
+
+namespace mongo {
+
+ ExpressionContext::~ExpressionContext() {
+ }
+
+    ExpressionContext::ExpressionContext():
+ inShard(false),
+ inRouter(false) {
+ }
+
+ ExpressionContext *ExpressionContext::create() {
+ return new ExpressionContext();
+ }
+
+}
diff --git a/src/mongo/db/pipeline/expression_context.h b/src/mongo/db/pipeline/expression_context.h
new file mode 100755
index 00000000000..0277039c80b
--- /dev/null
+++ b/src/mongo/db/pipeline/expression_context.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+
+ class ExpressionContext :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~ExpressionContext();
+
+ void setInShard(bool b);
+ void setInRouter(bool b);
+
+ bool getInShard() const;
+ bool getInRouter() const;
+
+ static ExpressionContext *create();
+
+ private:
+ ExpressionContext();
+
+ bool inShard;
+ bool inRouter;
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline void ExpressionContext::setInShard(bool b) {
+ inShard = b;
+ }
+
+ inline void ExpressionContext::setInRouter(bool b) {
+ inRouter = b;
+ }
+
+ inline bool ExpressionContext::getInShard() const {
+ return inShard;
+ }
+
+ inline bool ExpressionContext::getInRouter() const {
+ return inRouter;
+ }
+
+}
diff --git a/src/mongo/db/pipeline/field_path.cpp b/src/mongo/db/pipeline/field_path.cpp
new file mode 100755
index 00000000000..96e1fc92f83
--- /dev/null
+++ b/src/mongo/db/pipeline/field_path.cpp
@@ -0,0 +1,87 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/field_path.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ FieldPath::~FieldPath() {
+ }
+
+ FieldPath::FieldPath():
+ vFieldName() {
+ }
+
+ FieldPath::FieldPath(const string &fieldPath):
+ vFieldName() {
+ /*
+ The field path could be using dot notation.
+ Break the field path up by peeling off successive pieces.
+ */
+ size_t startpos = 0;
+ while(true) {
+ /* find the next dot */
+ const size_t dotpos = fieldPath.find('.', startpos);
+
+ /* if there are no more dots, use the remainder of the string */
+ if (dotpos == fieldPath.npos) {
+ vFieldName.push_back(fieldPath.substr(startpos, dotpos));
+ break;
+ }
+
+ /* use the string up to the dot */
+ const size_t length = dotpos - startpos;
+ uassert(15998, str::stream() <<
+ "field names cannot be zero length (in path \"" <<
+ fieldPath << "\")",
+ length > 0);
+
+ vFieldName.push_back(fieldPath.substr(startpos, length));
+
+ /* next time, search starting one spot after that */
+ startpos = dotpos + 1;
+ }
+ }
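+
+    /*
+      Illustrative example (not part of the original source):
+
+          FieldPath path("a.b.c");
+          // path.getPathLength() == 3
+          // path.getFieldName(1) == "b"
+          // path.getPath(true) == "$a.b.c"
+    */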
+
+ string FieldPath::getPath(bool fieldPrefix) const {
+ stringstream ss;
+ writePath(ss, fieldPrefix);
+ return ss.str();
+ }
+
+ void FieldPath::writePath(ostream &outStream, bool fieldPrefix) const {
+ if (fieldPrefix)
+ outStream << "$";
+
+ outStream << vFieldName[0];
+
+ const size_t n = vFieldName.size();
+ for(size_t i = 1; i < n; ++i)
+ outStream << "." << vFieldName[i];
+ }
+
+ FieldPath &FieldPath::operator=(const FieldPath &rRHS) {
+ if (this != &rRHS) {
+ vFieldName = rRHS.vFieldName;
+ }
+
+ return *this;
+ }
+
+}
diff --git a/src/mongo/db/pipeline/field_path.h b/src/mongo/db/pipeline/field_path.h
new file mode 100755
index 00000000000..810c5d0c7ea
--- /dev/null
+++ b/src/mongo/db/pipeline/field_path.h
@@ -0,0 +1,82 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ class FieldPath {
+ public:
+ virtual ~FieldPath();
+
+ FieldPath(const string &fieldPath);
+ FieldPath();
+
+ /*
+ Get the number of path elements in the field path.
+
+ @returns the number of path elements
+ */
+ size_t getPathLength() const;
+
+ /*
+ Get a particular path element from the path.
+
+ @param i the index of the path element
+ @returns the path element
+ */
+ string getFieldName(size_t i) const;
+
+ /*
+ Get the full path.
+
+ @param fieldPrefix whether or not to include the field prefix
+ @returns the complete field path
+ */
+ string getPath(bool fieldPrefix) const;
+
+ /*
+ Write the full path.
+
+ @param outStream where to write the path to
+ @param fieldPrefix whether or not to include the field prefix
+ */
+ void writePath(ostream &outStream, bool fieldPrefix) const;
+
+ FieldPath &operator=(const FieldPath &rRHS);
+
+ private:
+ vector<string> vFieldName;
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline size_t FieldPath::getPathLength() const {
+ return vFieldName.size();
+ }
+
+ inline string FieldPath::getFieldName(size_t i) const {
+ return vFieldName[i];
+ }
+
+}
+
diff --git a/src/mongo/db/pipeline/value.cpp b/src/mongo/db/pipeline/value.cpp
new file mode 100755
index 00000000000..b83dec359cf
--- /dev/null
+++ b/src/mongo/db/pipeline/value.cpp
@@ -0,0 +1,1034 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/value.h"
+
+#include <boost/functional/hash.hpp>
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+#include "db/pipeline/document.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ const intrusive_ptr<const Value> Value::pFieldUndefined(
+ new ValueStatic(Undefined));
+ const intrusive_ptr<const Value> Value::pFieldNull(new ValueStatic());
+ const intrusive_ptr<const Value> Value::pFieldTrue(new ValueStatic(true));
+ const intrusive_ptr<const Value> Value::pFieldFalse(new ValueStatic(false));
+ const intrusive_ptr<const Value> Value::pFieldMinusOne(new ValueStatic(-1));
+ const intrusive_ptr<const Value> Value::pFieldZero(new ValueStatic(0));
+ const intrusive_ptr<const Value> Value::pFieldOne(new ValueStatic(1));
+
+ Value::~Value() {
+ }
+
+ Value::Value():
+ type(jstNULL),
+ oidValue(),
+ dateValue(),
+ stringValue(),
+ pDocumentValue(),
+ vpValue() {
+ }
+
+ Value::Value(BSONType theType):
+ type(theType),
+ oidValue(),
+ dateValue(),
+ stringValue(),
+ pDocumentValue(),
+ vpValue() {
+ switch(type) {
+ case Undefined:
+ case jstNULL:
+ case Object: // empty
+ case Array: // empty
+ break;
+
+ case NumberDouble:
+ simple.doubleValue = 0;
+ break;
+
+ case Bool:
+ simple.boolValue = false;
+ break;
+
+ case NumberInt:
+ simple.intValue = 0;
+ break;
+
+ case Timestamp:
+ simple.timestampValue = 0;
+ break;
+
+ case NumberLong:
+ simple.longValue = 0;
+ break;
+
+ default:
+ // nothing else is allowed
+ uassert(16001, str::stream() <<
+ "can't create empty Value of type " << type, false);
+ break;
+ }
+ }
+
+ Value::Value(bool boolValue):
+ type(Bool),
+ pDocumentValue(),
+ vpValue() {
+ simple.boolValue = boolValue;
+ }
+
+ intrusive_ptr<const Value> Value::createFromBsonElement(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<const Value> pValue(new Value(pBsonElement));
+ return pValue;
+ }
+
+ Value::Value(BSONElement *pBsonElement):
+ type(pBsonElement->type()),
+ pDocumentValue(),
+ vpValue() {
+ switch(type) {
+ case NumberDouble:
+ simple.doubleValue = pBsonElement->Double();
+ break;
+
+ case String:
+ stringValue = pBsonElement->String();
+ break;
+
+ case Object: {
+ BSONObj document(pBsonElement->embeddedObject());
+ pDocumentValue = Document::createFromBsonObj(&document);
+ break;
+ }
+
+ case Array: {
+ vector<BSONElement> vElement(pBsonElement->Array());
+ const size_t n = vElement.size();
+
+ vpValue.reserve(n); // save on realloc()ing
+
+ for(size_t i = 0; i < n; ++i) {
+ vpValue.push_back(
+ Value::createFromBsonElement(&vElement[i]));
+ }
+ break;
+ }
+
+ case jstOID:
+ oidValue = pBsonElement->OID();
+ break;
+
+ case Bool:
+ simple.boolValue = pBsonElement->Bool();
+ break;
+
+ case Date:
+ dateValue = pBsonElement->Date();
+ break;
+
+ case RegEx:
+ stringValue = pBsonElement->regex();
+ // TODO pBsonElement->regexFlags();
+ break;
+
+ case NumberInt:
+ simple.intValue = pBsonElement->numberInt();
+ break;
+
+ case Timestamp:
+ dateValue = pBsonElement->timestampTime();
+ break;
+
+ case NumberLong:
+ simple.longValue = pBsonElement->numberLong();
+ break;
+
+ case jstNULL:
+ break;
+
+ case BinData:
+ case Symbol:
+ case CodeWScope:
+ uassert(16002, str::stream() <<
+ "can't create Value of type " << type, false);
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case Undefined:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+ }
+
+ Value::Value(int intValue):
+ type(NumberInt),
+ pDocumentValue(),
+ vpValue() {
+ simple.intValue = intValue;
+ }
+
+ intrusive_ptr<const Value> Value::createInt(int value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(long long longValue):
+ type(NumberLong),
+ pDocumentValue(),
+ vpValue() {
+ simple.longValue = longValue;
+ }
+
+ intrusive_ptr<const Value> Value::createLong(long long value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(double value):
+ type(NumberDouble),
+ pDocumentValue(),
+ vpValue() {
+ simple.doubleValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createDouble(double value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const Date_t &value):
+ type(Date),
+ pDocumentValue(),
+ vpValue() {
+ dateValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createDate(const Date_t &value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const string &value):
+ type(String),
+ pDocumentValue(),
+ vpValue() {
+ stringValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createString(const string &value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const intrusive_ptr<Document> &pDocument):
+ type(Object),
+ pDocumentValue(pDocument),
+ vpValue() {
+ }
+
+ intrusive_ptr<const Value> Value::createDocument(
+ const intrusive_ptr<Document> &pDocument) {
+ intrusive_ptr<const Value> pValue(new Value(pDocument));
+ return pValue;
+ }
+
+ Value::Value(const vector<intrusive_ptr<const Value> > &thevpValue):
+ type(Array),
+ pDocumentValue(),
+ vpValue(thevpValue) {
+ }
+
+ intrusive_ptr<const Value> Value::createArray(
+ const vector<intrusive_ptr<const Value> > &vpValue) {
+ intrusive_ptr<const Value> pValue(new Value(vpValue));
+ return pValue;
+ }
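
For orientation, a minimal usage sketch of the factory functions above. Callers never construct Values directly (the constructors are non-public); they obtain immutable intrusive_ptr<const Value> handles from the static create*() methods:

    intrusive_ptr<const Value> pInt(Value::createInt(5));
    intrusive_ptr<const Value> pStr(Value::createString("hello"));

    vector<intrusive_ptr<const Value> > vals;
    vals.push_back(pInt);
    vals.push_back(pStr);
    intrusive_ptr<const Value> pArr(Value::createArray(vals)); // [5, "hello"]
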
+
+ double Value::getDouble() const {
+ BSONType type = getType();
+ if (type == NumberInt)
+ return simple.intValue;
+ if (type == NumberLong)
+ return static_cast< double >( simple.longValue );
+
+ assert(type == NumberDouble);
+ return simple.doubleValue;
+ }
+
+ string Value::getString() const {
+ assert(getType() == String);
+ return stringValue;
+ }
+
+ intrusive_ptr<Document> Value::getDocument() const {
+ assert(getType() == Object);
+ return pDocumentValue;
+ }
+
+ ValueIterator::~ValueIterator() {
+ }
+
+ Value::vi::~vi() {
+ }
+
+ bool Value::vi::more() const {
+ return (nextIndex < size);
+ }
+
+ intrusive_ptr<const Value> Value::vi::next() {
+ assert(more());
+ return (*pvpValue)[nextIndex++];
+ }
+
+ Value::vi::vi(const intrusive_ptr<const Value> &pValue,
+ const vector<intrusive_ptr<const Value> > *thepvpValue):
+ size(thepvpValue->size()),
+ nextIndex(0),
+ pvpValue(thepvpValue) {
+ }
+
+ intrusive_ptr<ValueIterator> Value::getArray() const {
+ assert(getType() == Array);
+ intrusive_ptr<ValueIterator> pVI(
+ new vi(intrusive_ptr<const Value>(this), &vpValue));
+ return pVI;
+ }
+
+ OID Value::getOid() const {
+ assert(getType() == jstOID);
+ return oidValue;
+ }
+
+ bool Value::getBool() const {
+ assert(getType() == Bool);
+ return simple.boolValue;
+ }
+
+ Date_t Value::getDate() const {
+ assert(getType() == Date);
+ return dateValue;
+ }
+
+ string Value::getRegex() const {
+ assert(getType() == RegEx);
+ return stringValue;
+ }
+
+ string Value::getSymbol() const {
+ assert(getType() == Symbol);
+ return stringValue;
+ }
+
+ int Value::getInt() const {
+ assert(getType() == NumberInt);
+ return simple.intValue;
+ }
+
+ unsigned long long Value::getTimestamp() const {
+ assert(getType() == Timestamp);
+ return dateValue;
+ }
+
+ long long Value::getLong() const {
+ BSONType type = getType();
+ if (type == NumberInt)
+ return simple.intValue;
+
+ assert(type == NumberLong);
+ return simple.longValue;
+ }
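
Note the asymmetry in the numeric getters above: getDouble() and getLong() silently widen narrower numeric types, while getInt() and the other typed getters assert on any mismatch. A small sketch:

    intrusive_ptr<const Value> pv(Value::createInt(7));
    double d = pv->getDouble();  // 7.0 -- NumberInt widened to double
    long long n = pv->getLong(); // 7   -- NumberInt widened to long long
    // calling getInt() on a NumberLong Value, by contrast, would assert;
    // narrowing is never implicit
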
+
+ void Value::addToBson(Builder *pBuilder) const {
+ switch(getType()) {
+ case NumberDouble:
+ pBuilder->append(getDouble());
+ break;
+
+ case String:
+ pBuilder->append(getString());
+ break;
+
+ case Object: {
+ intrusive_ptr<Document> pDocument(getDocument());
+ BSONObjBuilder subBuilder;
+ pDocument->toBson(&subBuilder);
+ subBuilder.done();
+ pBuilder->append(&subBuilder);
+ break;
+ }
+
+ case Array: {
+ const size_t n = vpValue.size();
+ BSONArrayBuilder arrayBuilder(n);
+ for(size_t i = 0; i < n; ++i) {
+ vpValue[i]->addToBsonArray(&arrayBuilder);
+ }
+
+ pBuilder->append(&arrayBuilder);
+ break;
+ }
+
+ case BinData:
+ // pBuilder->appendBinData(fieldName, ...);
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ pBuilder->append(getOid());
+ break;
+
+ case Bool:
+ pBuilder->append(getBool());
+ break;
+
+ case Date:
+ pBuilder->append(getDate());
+ break;
+
+ case RegEx:
+ pBuilder->append(getRegex());
+ break;
+
+ case Symbol:
+ pBuilder->append(getSymbol());
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ pBuilder->append(getInt());
+ break;
+
+ case Timestamp:
+ pBuilder->append((long long)getTimestamp());
+ break;
+
+ case NumberLong:
+ pBuilder->append(getLong());
+ break;
+
+ case jstNULL:
+ pBuilder->append();
+ break;
+
+ /* these shouldn't appear in this context */
+ case MinKey:
+ case EOO:
+ case Undefined:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+ }
+
+ void Value::addToBsonObj(BSONObjBuilder *pBuilder, string fieldName) const {
+ BuilderObj objBuilder(pBuilder, fieldName);
+ addToBson(&objBuilder);
+ }
+
+ void Value::addToBsonArray(BSONArrayBuilder *pBuilder) const {
+ BuilderArray arrBuilder(pBuilder);
+ addToBson(&arrBuilder);
+ }
+
+ bool Value::coerceToBool() const {
+ BSONType type = getType();
+ switch(type) {
+ case NumberDouble:
+ if (simple.doubleValue != 0)
+ return true;
+ break;
+
+ case String:
+ case Object:
+ case Array:
+ case BinData:
+ case jstOID:
+ case Date:
+ case RegEx:
+ case Symbol:
+ case Timestamp:
+ return true;
+
+ case Bool:
+ if (simple.boolValue)
+ return true;
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ if (simple.intValue != 0)
+ return true;
+ break;
+
+ case NumberLong:
+ if (simple.longValue != 0)
+ return true;
+ break;
+
+ case jstNULL:
+ case Undefined:
+ /* nothing to do */
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+
+ return false;
+ }
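
A few worked cases of the truthiness rules implemented above; note that strings, objects, and arrays are always true, even when empty:

    Value::createInt(0)->coerceToBool();      // false -- zero number
    Value::createDouble(0.5)->coerceToBool(); // true  -- non-zero number
    Value::createString("")->coerceToBool();  // true  -- any string
    Value::getNull()->coerceToBool();         // false -- null/undefined
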
+
+ intrusive_ptr<const Value> Value::coerceToBoolean() const {
+ bool result = coerceToBool();
+
+ /* always normalize to the singletons */
+ if (result)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ int Value::coerceToInt() const {
+ switch(type) {
+ case NumberDouble:
+ return (int)simple.doubleValue;
+
+ case NumberInt:
+ return simple.intValue;
+
+ case NumberLong:
+ return (int)simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16003, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to int",
+ false);
+ } // switch(type)
+
+ return (int)0;
+ }
+
+ long long Value::coerceToLong() const {
+ switch(type) {
+ case NumberDouble:
+ return (long long)simple.doubleValue;
+
+ case NumberInt:
+ return simple.intValue;
+
+ case NumberLong:
+ return simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16004, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to long",
+ false);
+ } // switch(type)
+
+ return (long long)0;
+ }
+
+ double Value::coerceToDouble() const {
+ switch(type) {
+ case NumberDouble:
+ return simple.doubleValue;
+
+ case NumberInt:
+ return (double)simple.intValue;
+
+ case NumberLong:
+ return (double)simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16005, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to double",
+ false);
+ } // switch(type)
+
+ return (double)0;
+ }
+
+ Date_t Value::coerceToDate() const {
+ switch(type) {
+
+ case Date:
+ return dateValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ default:
+ uassert(16006, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to double",
+ false);
+ } // switch(type)
+
+ assert(false); // CW TODO no conversion available
+        return Date_t();
+ }
+
+ string Value::coerceToString() const {
+ stringstream ss;
+ switch(type) {
+ case NumberDouble:
+ ss << simple.doubleValue;
+ return ss.str();
+
+ case NumberInt:
+ ss << simple.intValue;
+ return ss.str();
+
+ case NumberLong:
+ ss << simple.longValue;
+ return ss.str();
+
+ case String:
+ return stringValue;
+
+ case Date:
+ return dateValue.toString();
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ default:
+ uassert(16007, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to double",
+ false);
+ } // switch(type)
+
+ return "";
+ }
+
+ int Value::compare(const intrusive_ptr<const Value> &rL,
+ const intrusive_ptr<const Value> &rR) {
+ BSONType lType = rL->getType();
+ BSONType rType = rR->getType();
+
+ /*
+ Special handling for Undefined and NULL values; these are types,
+ so it's easier to handle them here before we go below to handle
+ values of the same types. This allows us to compare Undefined and
+ NULL values with everything else. As coded now:
+ (*) Undefined is less than everything except itself (which is equal)
+ (*) NULL is less than everything except Undefined and itself
+ */
+ if (lType == Undefined) {
+ if (rType == Undefined)
+ return 0;
+
+ /* if rType is anything else, the left value is less */
+ return -1;
+ }
+
+ if (lType == jstNULL) {
+ if (rType == Undefined)
+ return 1;
+ if (rType == jstNULL)
+ return 0;
+
+ return -1;
+ }
+
+ if ((rType == Undefined) || (rType == jstNULL)) {
+ /*
+ We know the left value isn't Undefined, because of the above.
+ Count a NULL value as greater than an undefined one.
+ */
+ return 1;
+ }
+
+ // CW TODO for now, only compare like values
+ uassert(16016, str::stream() <<
+ "can't compare values of BSON types " << lType <<
+ " and " << rType,
+ lType == rType);
+
+ switch(lType) {
+ case NumberDouble:
+ if (rL->simple.doubleValue < rR->simple.doubleValue)
+ return -1;
+ if (rL->simple.doubleValue > rR->simple.doubleValue)
+ return 1;
+ return 0;
+
+ case String:
+ return rL->stringValue.compare(rR->stringValue);
+
+ case Object:
+ return Document::compare(rL->getDocument(), rR->getDocument());
+
+ case Array: {
+ intrusive_ptr<ValueIterator> pli(rL->getArray());
+ intrusive_ptr<ValueIterator> pri(rR->getArray());
+
+ while(true) {
+ /* have we run out of left array? */
+ if (!pli->more()) {
+ if (!pri->more())
+ return 0; // the arrays are the same length
+
+ return -1; // the left array is shorter
+ }
+
+ /* have we run out of right array? */
+ if (!pri->more())
+ return 1; // the right array is shorter
+
+ /* compare the two corresponding elements */
+ intrusive_ptr<const Value> plv(pli->next());
+ intrusive_ptr<const Value> prv(pri->next());
+ const int cmp = Value::compare(plv, prv);
+ if (cmp)
+ return cmp; // values are unequal
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ break;
+ }
+
+ case BinData:
+ // pBuilder->appendBinData(fieldName, ...);
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ if (rL->oidValue < rR->oidValue)
+ return -1;
+ if (rL->oidValue == rR->oidValue)
+ return 0;
+ return 1;
+
+ case Bool:
+ if (rL->simple.boolValue == rR->simple.boolValue)
+ return 0;
+ if (rL->simple.boolValue)
+ return 1;
+ return -1;
+
+ case Date:
+ if (rL->dateValue < rR->dateValue)
+ return -1;
+ if (rL->dateValue > rR->dateValue)
+ return 1;
+ return 0;
+
+ case RegEx:
+ return rL->stringValue.compare(rR->stringValue);
+
+ case Symbol:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ if (rL->simple.intValue < rR->simple.intValue)
+ return -1;
+ if (rL->simple.intValue > rR->simple.intValue)
+ return 1;
+ return 0;
+
+ case Timestamp:
+ if (rL->dateValue < rR->dateValue)
+ return -1;
+ if (rL->dateValue > rR->dateValue)
+ return 1;
+ return 0;
+
+ case NumberLong:
+ if (rL->simple.longValue < rR->simple.longValue)
+ return -1;
+ if (rL->simple.longValue > rR->simple.longValue)
+ return 1;
+ return 0;
+
+ case Undefined:
+ case jstNULL:
+ return 0; // treat two Undefined or NULL values as equal
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ } // switch(lType)
+
+ /* NOTREACHED */
+ return 0;
+ }
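
Worked examples of the ordering implemented above, using the shared singletons and factories:

    Value::compare(Value::getUndefined(), Value::getNull()); // -1: Undefined sorts lowest
    Value::compare(Value::getNull(), Value::getZero());      // -1: NULL below all values
    Value::compare(Value::getOne(), Value::createInt(2));    // negative: 1 < 2
    // comparing unlike types, e.g. createInt(1) vs createString("1"),
    // throws via uassert 16016 -- cross-type ordering is not implemented yet
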
+
+ void Value::hash_combine(size_t &seed) const {
+ BSONType type = getType();
+ boost::hash_combine(seed, (int)type);
+
+ switch(type) {
+ case NumberDouble:
+ boost::hash_combine(seed, simple.doubleValue);
+ break;
+
+ case String:
+ boost::hash_combine(seed, stringValue);
+ break;
+
+ case Object:
+ getDocument()->hash_combine(seed);
+ break;
+
+ case Array: {
+ intrusive_ptr<ValueIterator> pIter(getArray());
+ while(pIter->more()) {
+ intrusive_ptr<const Value> pValue(pIter->next());
+ pValue->hash_combine(seed);
+                }
+ break;
+ }
+
+ case BinData:
+ // pBuilder->appendBinData(fieldName, ...);
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ oidValue.hash_combine(seed);
+ break;
+
+ case Bool:
+ boost::hash_combine(seed, simple.boolValue);
+ break;
+
+ case Date:
+ boost::hash_combine(seed, (unsigned long long)dateValue);
+ break;
+
+ case RegEx:
+ boost::hash_combine(seed, stringValue);
+ break;
+
+ case Symbol:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ boost::hash_combine(seed, simple.intValue);
+ break;
+
+ case Timestamp:
+ boost::hash_combine(seed, (unsigned long long)dateValue);
+ break;
+
+ case NumberLong:
+ boost::hash_combine(seed, simple.longValue);
+ break;
+
+ case Undefined:
+ case jstNULL:
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ } // switch(type)
+ }
+
+ BSONType Value::getWidestNumeric(BSONType lType, BSONType rType) {
+ if (lType == NumberDouble) {
+ switch(rType) {
+ case NumberDouble:
+ case NumberLong:
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberDouble;
+
+ default:
+ break;
+ }
+ }
+ else if (lType == NumberLong) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberLong;
+
+ default:
+ break;
+ }
+ }
+ else if (lType == NumberInt) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ return NumberLong;
+
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberInt;
+
+ default:
+ break;
+ }
+ }
+ else if ((lType == jstNULL) || (lType == Undefined)) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ return NumberLong;
+
+ case NumberInt:
+ return NumberInt;
+
+ default:
+ break;
+ }
+ }
+
+ /* NOTREACHED */
+ return Undefined;
+ }
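
A quick reference for the promotion lattice above:

    Value::getWidestNumeric(NumberInt,  NumberLong);   // NumberLong
    Value::getWidestNumeric(NumberLong, NumberDouble); // NumberDouble
    Value::getWidestNumeric(jstNULL,    NumberInt);    // NumberInt -- nulls defer
    Value::getWidestNumeric(String,     NumberInt);    // Undefined -- non-numeric operand
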
+
+ size_t Value::getApproximateSize() const {
+ switch(type) {
+ case String:
+ return sizeof(Value) + stringValue.length();
+
+ case Object:
+ return sizeof(Value) + pDocumentValue->getApproximateSize();
+
+ case Array: {
+ size_t size = sizeof(Value);
+ const size_t n = vpValue.size();
+ for(size_t i = 0; i < n; ++i) {
+ size += vpValue[i]->getApproximateSize();
+ }
+ return size;
+ }
+
+ case NumberDouble:
+ case BinData:
+ case jstOID:
+ case Bool:
+ case Date:
+ case RegEx:
+ case Symbol:
+ case CodeWScope:
+ case NumberInt:
+ case Timestamp:
+ case NumberLong:
+ case jstNULL:
+ case Undefined:
+ return sizeof(Value);
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ return sizeof(Value);
+ }
+
+ /*
+ We shouldn't get here. In order to make the implementor think about
+          these cases, they are all listed explicitly above. The compiler
+          should complain if they aren't all listed, because there's no
+          default. However, not all compilers do that. Therefore,
+ this final catch-all is here.
+ */
+ assert(false);
+ return sizeof(Value);
+ }
+
+
+ void ValueStatic::addRef() const {
+ }
+
+ void ValueStatic::release() const {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/value.h b/src/mongo/db/pipeline/value.h
new file mode 100755
index 00000000000..8bd1bcbbbfd
--- /dev/null
+++ b/src/mongo/db/pipeline/value.h
@@ -0,0 +1,468 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "bson/bsontypes.h"
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+ class BSONElement;
+ class Builder;
+ class Document;
+ class Value;
+
+ class ValueIterator :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~ValueIterator();
+
+ /*
+          Ask if there are more values to return.
+
+          @returns true if there are more values, false otherwise
+ */
+ virtual bool more() const = 0;
+
+ /*
+          Advance the iterator to the next value and return it.
+
+          @returns the next Value
+ */
+ virtual intrusive_ptr<const Value> next() = 0;
+ };
+
+
+ /*
+ Values are immutable, so these are passed around as
+ intrusive_ptr<const Value>.
+ */
+ class Value :
+ public IntrusiveCounterUnsigned {
+ public:
+ ~Value();
+
+ /*
+ Construct a Value from a BSONElement.
+
+ This ignores the name of the element, and only uses the value,
+ whatever type it is.
+
+ @returns a new Value initialized from the bsonElement
+ */
+ static intrusive_ptr<const Value> createFromBsonElement(
+ BSONElement *pBsonElement);
+
+ /*
+ Construct an integer-valued Value.
+
+ For commonly used values, consider using one of the singleton
+ instances defined below.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createInt(int value);
+
+ /*
+          Construct a long long-valued Value.
+
+ For commonly used values, consider using one of the singleton
+ instances defined below.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createLong(long long value);
+
+ /*
+ Construct a double-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDouble(double value);
+
+ /*
+ Construct a string-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createString(const string &value);
+
+ /*
+ Construct a date-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDate(const Date_t &value);
+
+ /*
+ Construct a document-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDocument(
+ const intrusive_ptr<Document> &pDocument);
+
+ /*
+ Construct an array-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createArray(
+ const vector<intrusive_ptr<const Value> > &vpValue);
+
+ /*
+ Get the BSON type of the field.
+
+ If the type is jstNULL, no value getter will work.
+
+ @return the BSON type of the field.
+ */
+ BSONType getType() const;
+
+ /*
+ Getters.
+
+ @returns the Value's value; asserts if the requested value type is
+ incorrect.
+ */
+ double getDouble() const;
+ string getString() const;
+ intrusive_ptr<Document> getDocument() const;
+ intrusive_ptr<ValueIterator> getArray() const;
+ OID getOid() const;
+ bool getBool() const;
+ Date_t getDate() const;
+ string getRegex() const;
+ string getSymbol() const;
+ int getInt() const;
+ unsigned long long getTimestamp() const;
+ long long getLong() const;
+
+ /*
+ Get the length of an array value.
+
+ @returns the length of the array, if this is array-valued; otherwise
+ throws an error
+ */
+ size_t getArrayLength() const;
+
+ /*
+ Add this value to the BSON object under construction.
+ */
+ void addToBsonObj(BSONObjBuilder *pBuilder, string fieldName) const;
+
+ /*
+          Add this value to the BSON array under construction.
+
+          As an array element, the Value has no field name.
+ */
+ void addToBsonArray(BSONArrayBuilder *pBuilder) const;
+
+ /*
+ Get references to singleton instances of commonly used field values.
+ */
+ static intrusive_ptr<const Value> getUndefined();
+ static intrusive_ptr<const Value> getNull();
+ static intrusive_ptr<const Value> getTrue();
+ static intrusive_ptr<const Value> getFalse();
+ static intrusive_ptr<const Value> getMinusOne();
+ static intrusive_ptr<const Value> getZero();
+ static intrusive_ptr<const Value> getOne();
+
+ /*
+ Coerce (cast) a value to a native bool, using JSON rules.
+
+ @returns the bool value
+ */
+ bool coerceToBool() const;
+
+ /*
+ Coerce (cast) a value to a Boolean Value, using JSON rules.
+
+          @returns the Boolean Value (always one of the shared singletons)
+ */
+ intrusive_ptr<const Value> coerceToBoolean() const;
+
+ /*
+ Coerce (cast) a value to an int, using JSON rules.
+
+ @returns the int value
+ */
+ int coerceToInt() const;
+
+ /*
+ Coerce (cast) a value to a long long, using JSON rules.
+
+ @returns the long value
+ */
+ long long coerceToLong() const;
+
+ /*
+ Coerce (cast) a value to a double, using JSON rules.
+
+ @returns the double value
+ */
+ double coerceToDouble() const;
+
+ /*
+ Coerce (cast) a value to a date, using JSON rules.
+
+ @returns the date value
+ */
+ Date_t coerceToDate() const;
+
+ /*
+ Coerce (cast) a value to a string, using JSON rules.
+
+          @returns the string value
+ */
+ string coerceToString() const;
+
+ /*
+ Compare two Values.
+
+ @param rL left value
+ @param rR right value
+ @returns an integer less than zero, zero, or an integer greater than
+ zero, depending on whether rL < rR, rL == rR, or rL > rR
+ */
+ static int compare(const intrusive_ptr<const Value> &rL,
+ const intrusive_ptr<const Value> &rR);
+
+
+ /*
+ Figure out what the widest of two numeric types is.
+
+ Widest can be thought of as "most capable," or "able to hold the
+ largest or most precise value." The progression is Int, Long, Double.
+
+ @param rL left value
+ @param rR right value
+ @returns a BSONType of NumberInt, NumberLong, or NumberDouble
+ */
+ static BSONType getWidestNumeric(BSONType lType, BSONType rType);
+
+ /*
+ Get the approximate storage size of the value, in bytes.
+
+ @returns approximate storage size of the value.
+ */
+ size_t getApproximateSize() const;
+
+ /*
+ Calculate a hash value.
+
+ Meant to be used to create composite hashes suitable for
+ boost classes such as unordered_map<>.
+
+          @param seed the value to augment with this Value's hash
+ */
+ void hash_combine(size_t &seed) const;
+
+ /*
+ struct Hash is defined to enable the use of Values as
+ keys in boost::unordered_map<>.
+
+ Values are always referenced as immutables in the form
+ intrusive_ptr<const Value>, so these operate on that construction.
+ */
+ struct Hash :
+ unary_function<intrusive_ptr<const Value>, size_t> {
+ size_t operator()(const intrusive_ptr<const Value> &rV) const;
+ };
+
+ protected:
+ Value(); // creates null value
+        Value(BSONType type); // creates an empty (uninitialized) value of type;
+ // mostly useful for Undefined
+ Value(bool boolValue);
+ Value(int intValue);
+
+ private:
+ Value(BSONElement *pBsonElement);
+
+ Value(long long longValue);
+ Value(double doubleValue);
+ Value(const Date_t &dateValue);
+ Value(const string &stringValue);
+ Value(const intrusive_ptr<Document> &pDocument);
+ Value(const vector<intrusive_ptr<const Value> > &vpValue);
+
+ void addToBson(Builder *pBuilder) const;
+
+ BSONType type;
+
+ /* store value in one of these */
+ union {
+ double doubleValue;
+ bool boolValue;
+ int intValue;
+ unsigned long long timestampValue;
+ long long longValue;
+
+ } simple; // values that don't need a ctor/dtor
+ OID oidValue;
+ Date_t dateValue;
+ string stringValue; // String, Regex, Symbol
+ intrusive_ptr<Document> pDocumentValue;
+ vector<intrusive_ptr<const Value> > vpValue; // for arrays
+
+
+ /*
+ These are often used as the result of boolean or comparison
+ expressions.
+
+ These are obtained via public static getters defined above.
+ */
+ static const intrusive_ptr<const Value> pFieldUndefined;
+ static const intrusive_ptr<const Value> pFieldNull;
+ static const intrusive_ptr<const Value> pFieldTrue;
+ static const intrusive_ptr<const Value> pFieldFalse;
+ static const intrusive_ptr<const Value> pFieldMinusOne;
+ static const intrusive_ptr<const Value> pFieldZero;
+ static const intrusive_ptr<const Value> pFieldOne;
+
+ /* this implementation is used for getArray() */
+ class vi :
+ public ValueIterator {
+ public:
+ // virtuals from ValueIterator
+ virtual ~vi();
+ virtual bool more() const;
+ virtual intrusive_ptr<const Value> next();
+
+ private:
+ friend class Value;
+ vi(const intrusive_ptr<const Value> &pSource,
+ const vector<intrusive_ptr<const Value> > *pvpValue);
+
+ size_t size;
+ size_t nextIndex;
+ const vector<intrusive_ptr<const Value> > *pvpValue;
+ }; /* class vi */
+
+ };
+
+ /*
+ Equality operator for values.
+
+ Useful for unordered_map<>, etc.
+ */
+ inline bool operator==(const intrusive_ptr<const Value> &v1,
+ const intrusive_ptr<const Value> &v2) {
+ return (Value::compare(v1, v2) == 0);
+ }
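
A sketch of how Value::Hash and this equality operator are meant to be used together, e.g. to count distinct values in a $group-style accumulator. The ValueEquals functor here is hypothetical (not part of this commit); it is spelled out rather than relying on operator== lookup inside the container, which can be fragile. Assumes <boost/unordered_map.hpp> is available, as the comments above suggest:

    struct ValueEquals {
        bool operator()(const intrusive_ptr<const Value> &v1,
                        const intrusive_ptr<const Value> &v2) const {
            return Value::compare(v1, v2) == 0;
        }
    };

    boost::unordered_map<intrusive_ptr<const Value>, long long,
                         Value::Hash, ValueEquals> counts;
    ++counts[Value::createInt(1)];
    ++counts[Value::createInt(1)]; // equal hash and compare() == 0: same entry
    // counts.size() == 1; the mapped count is 2
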
+
+ /*
+ For performance reasons, there are various sharable static values
+ defined in class Value, obtainable by methods such as getUndefined(),
+ getTrue(), getOne(), etc. We don't want these to go away as they are
+ used by a multitude of threads evaluating pipelines. In order to avoid
+ having to use atomic integers in the intrusive reference counter, this
+ class overrides the reference counting methods to do nothing, making it
+ safe to use for static Values.
+
+ At this point, only the constructors necessary for the static Values in
+ common use have been defined. The remainder can be defined if necessary.
+ */
+ class ValueStatic :
+ public Value {
+ public:
+ // virtuals from IntrusiveCounterUnsigned
+ virtual void addRef() const;
+ virtual void release() const;
+
+ // constructors
+ ValueStatic();
+ ValueStatic(BSONType type);
+ ValueStatic(bool boolValue);
+ ValueStatic(int intValue);
+ };
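
Presumably the shared singletons declared earlier (pFieldTrue, pFieldZero, ...) are then defined in value.cpp along these lines; this is a sketch of the idiom, not the literal definitions:

    // no-op refcounting makes these safe as process-lifetime statics
    const intrusive_ptr<const Value> Value::pFieldTrue(new ValueStatic(true));
    const intrusive_ptr<const Value> Value::pFieldZero(new ValueStatic(0));
    const intrusive_ptr<const Value> Value::pFieldNull(new ValueStatic());
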
+}
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline BSONType Value::getType() const {
+ return type;
+ }
+
+ inline size_t Value::getArrayLength() const {
+ assert(getType() == Array);
+ return vpValue.size();
+ }
+
+ inline intrusive_ptr<const Value> Value::getUndefined() {
+ return pFieldUndefined;
+ }
+
+ inline intrusive_ptr<const Value> Value::getNull() {
+ return pFieldNull;
+ }
+
+ inline intrusive_ptr<const Value> Value::getTrue() {
+ return pFieldTrue;
+ }
+
+ inline intrusive_ptr<const Value> Value::getFalse() {
+ return pFieldFalse;
+ }
+
+ inline intrusive_ptr<const Value> Value::getMinusOne() {
+ return pFieldMinusOne;
+ }
+
+ inline intrusive_ptr<const Value> Value::getZero() {
+ return pFieldZero;
+ }
+
+ inline intrusive_ptr<const Value> Value::getOne() {
+ return pFieldOne;
+ }
+
+ inline size_t Value::Hash::operator()(
+ const intrusive_ptr<const Value> &rV) const {
+ size_t seed = 0xf0afbeef;
+ rV->hash_combine(seed);
+ return seed;
+ }
+
+ inline ValueStatic::ValueStatic():
+ Value() {
+ }
+
+ inline ValueStatic::ValueStatic(BSONType type):
+ Value(type) {
+ }
+
+ inline ValueStatic::ValueStatic(bool boolValue):
+ Value(boolValue) {
+ }
+
+ inline ValueStatic::ValueStatic(int intValue):
+ Value(intValue) {
+ }
+
+}
diff --git a/src/mongo/db/projection.cpp b/src/mongo/db/projection.cpp
new file mode 100644
index 00000000000..d07e56527af
--- /dev/null
+++ b/src/mongo/db/projection.cpp
@@ -0,0 +1,301 @@
+// projection.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "projection.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+
+ void Projection::init( const BSONObj& o ) {
+ massert( 10371 , "can only add to Projection once", _source.isEmpty());
+ _source = o;
+
+ BSONObjIterator i( o );
+ int true_false = -1;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ if ( ! e.isNumber() )
+ _hasNonSimple = true;
+
+ if (e.type() == Object) {
+ BSONObj obj = e.embeddedObject();
+ BSONElement e2 = obj.firstElement();
+ if ( strcmp(e2.fieldName(), "$slice") == 0 ) {
+ if (e2.isNumber()) {
+ int i = e2.numberInt();
+ if (i < 0)
+ add(e.fieldName(), i, -i); // limit is now positive
+ else
+ add(e.fieldName(), 0, i);
+
+ }
+ else if (e2.type() == Array) {
+ BSONObj arr = e2.embeddedObject();
+ uassert(13099, "$slice array wrong size", arr.nFields() == 2 );
+
+ BSONObjIterator it(arr);
+ int skip = it.next().numberInt();
+ int limit = it.next().numberInt();
+ uassert(13100, "$slice limit must be positive", limit > 0 );
+ add(e.fieldName(), skip, limit);
+
+ }
+ else {
+ uassert(13098, "$slice only supports numbers and [skip, limit] arrays", false);
+ }
+ }
+ else {
+ uassert(13097, string("Unsupported projection option: ") + obj.firstElementFieldName(), false);
+ }
+
+ }
+ else if (!strcmp(e.fieldName(), "_id") && !e.trueValue()) {
+ _includeID = false;
+
+ }
+ else {
+
+ add (e.fieldName(), e.trueValue());
+
+ // validate input
+ if (true_false == -1) {
+ true_false = e.trueValue();
+ _include = !e.trueValue();
+ }
+ else {
+ uassert( 10053 , "You cannot currently mix including and excluding fields. Contact us if this is an issue." ,
+ (bool)true_false == e.trueValue() );
+ }
+ }
+ }
+ }
+
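
To make the $slice parsing above concrete, this is how init() maps the three accepted spec forms onto add() (field name illustrative):

    // { comments: { $slice: 5 } }        -> add("comments", 0, 5)    first 5 elements
    // { comments: { $slice: -5 } }       -> add("comments", -5, 5)   last 5 elements
    // { comments: { $slice: [20, 10] } } -> add("comments", 20, 10)  skip 20, take 10
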
+ void Projection::add(const string& field, bool include) {
+ if (field.empty()) { // this is the field the user referred to
+ _include = include;
+ }
+ else {
+ _include = !include;
+
+ const size_t dot = field.find('.');
+ const string subfield = field.substr(0,dot);
+ const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+ boost::shared_ptr<Projection>& fm = _fields[subfield];
+ if (!fm)
+ fm.reset(new Projection());
+
+ fm->add(rest, include);
+ }
+ }
+
+ void Projection::add(const string& field, int skip, int limit) {
+ _special = true; // can't include or exclude whole object
+
+ if (field.empty()) { // this is the field the user referred to
+ _skip = skip;
+ _limit = limit;
+ }
+ else {
+ const size_t dot = field.find('.');
+ const string subfield = field.substr(0,dot);
+ const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+ boost::shared_ptr<Projection>& fm = _fields[subfield];
+ if (!fm)
+ fm.reset(new Projection());
+
+ fm->add(rest, skip, limit);
+ }
+ }
+
+ void Projection::transform( const BSONObj& in , BSONObjBuilder& b ) const {
+ BSONObjIterator i(in);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( mongoutils::str::equals( "_id" , e.fieldName() ) ) {
+ if ( _includeID )
+ b.append( e );
+ }
+ else {
+ append( b , e );
+ }
+ }
+ }
+
+ BSONObj Projection::transform( const BSONObj& in ) const {
+ BSONObjBuilder b;
+ transform( in , b );
+ return b.obj();
+ }
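
A minimal usage sketch of the public interface (spec and document are illustrative):

    Projection p;
    p.init( BSON( "x" << 1 ) ); // include mode: keep _id and x only
    BSONObj out = p.transform( BSON( "x" << 1 << "y" << 2 ) );
    // out == { x: 1 } -- y is dropped; _id would be kept if present
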
+
+
+    // b is the value part of an array-typed BSONElement
+ void Projection::appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested) const {
+ int skip = nested ? 0 : _skip;
+ int limit = nested ? -1 : _limit;
+
+ if (skip < 0) {
+ skip = max(0, skip + a.nFields());
+ }
+
+ int i=0;
+ BSONObjIterator it(a);
+ while (it.more()) {
+ BSONElement e = it.next();
+
+ if (skip) {
+ skip--;
+ continue;
+ }
+
+ if (limit != -1 && (limit-- == 0)) {
+ break;
+ }
+
+ switch(e.type()) {
+ case Array: {
+ BSONObjBuilder subb;
+ appendArray(subb , e.embeddedObject(), true);
+ b.appendArray(b.numStr(i++), subb.obj());
+ break;
+ }
+ case Object: {
+ BSONObjBuilder subb;
+ BSONObjIterator jt(e.embeddedObject());
+ while (jt.more()) {
+ append(subb , jt.next());
+ }
+ b.append(b.numStr(i++), subb.obj());
+ break;
+ }
+ default:
+ if (_include)
+ b.appendAs(e, b.numStr(i++));
+ }
+ }
+ }
+
+ void Projection::append( BSONObjBuilder& b , const BSONElement& e ) const {
+ FieldMap::const_iterator field = _fields.find( e.fieldName() );
+
+ if (field == _fields.end()) {
+ if (_include)
+ b.append(e);
+ }
+ else {
+ Projection& subfm = *field->second;
+
+ if ((subfm._fields.empty() && !subfm._special) || !(e.type()==Object || e.type()==Array) ) {
+ if (subfm._include)
+ b.append(e);
+ }
+ else if (e.type() == Object) {
+ BSONObjBuilder subb;
+ BSONObjIterator it(e.embeddedObject());
+ while (it.more()) {
+ subfm.append(subb, it.next());
+ }
+ b.append(e.fieldName(), subb.obj());
+
+ }
+ else { //Array
+ BSONObjBuilder subb;
+ subfm.appendArray(subb, e.embeddedObject());
+ b.appendArray(e.fieldName(), subb.obj());
+ }
+ }
+ }
+
+ Projection::KeyOnly* Projection::checkKey( const BSONObj& keyPattern ) const {
+ if ( _include ) {
+ // if we default to including then we can't
+ // use an index because we don't know what we're missing
+ return 0;
+ }
+
+ if ( _hasNonSimple )
+ return 0;
+
+ if ( _includeID && keyPattern["_id"].eoo() )
+ return 0;
+
+        // at this point we know it's all { x : 1 } style
+
+ auto_ptr<KeyOnly> p( new KeyOnly() );
+
+ int got = 0;
+ BSONObjIterator i( keyPattern );
+ while ( i.more() ) {
+ BSONElement k = i.next();
+
+ if ( _source[k.fieldName()].type() ) {
+
+ if ( strchr( k.fieldName() , '.' ) ) {
+ // TODO we currently don't support dotted fields
+ // SERVER-2104
+ return 0;
+ }
+
+ if ( ! _includeID && mongoutils::str::equals( k.fieldName() , "_id" ) ) {
+ p->addNo();
+ }
+ else {
+ p->addYes( k.fieldName() );
+ got++;
+ }
+ }
+ else if ( mongoutils::str::equals( "_id" , k.fieldName() ) && _includeID ) {
+ p->addYes( "_id" );
+ }
+ else {
+ p->addNo();
+ }
+
+ }
+
+ int need = _source.nFields();
+ if ( ! _includeID )
+ need--;
+
+ if ( got == need )
+ return p.release();
+
+ return 0;
+ }
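
A sketch of a projection that checkKey() can cover from an index (names illustrative). Because _id is excluded and every included field appears in the key pattern, a KeyOnly is returned and results can be rebuilt from index keys alone via hydrate(), below:

    Projection p;
    p.init( BSON( "a" << 1 << "_id" << 0 ) );
    auto_ptr<Projection::KeyOnly> keyOnly( p.checkKey( BSON( "a" << 1 << "b" << 1 ) ) );
    // keyOnly.get() != 0: "a" comes from the key, "b" is marked addNo(),
    // and _id was explicitly excluded
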
+
+ BSONObj Projection::KeyOnly::hydrate( const BSONObj& key ) const {
+ assert( _include.size() == _names.size() );
+
+ BSONObjBuilder b( key.objsize() + _stringSize + 16 );
+
+ BSONObjIterator i(key);
+ unsigned n=0;
+ while ( i.more() ) {
+ assert( n < _include.size() );
+ BSONElement e = i.next();
+ if ( _include[n] ) {
+ b.appendAs( e , _names[n] );
+ }
+ n++;
+ }
+
+ return b.obj();
+ }
+}
diff --git a/src/mongo/db/projection.h b/src/mongo/db/projection.h
new file mode 100644
index 00000000000..b5e0a0c4289
--- /dev/null
+++ b/src/mongo/db/projection.h
@@ -0,0 +1,129 @@
+// projection.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ /**
+ * given a document and a projection specification
+ * can transform the document
+ * currently supports specifying which fields and $slice
+ */
+ class Projection {
+ public:
+
+ class KeyOnly {
+ public:
+
+ KeyOnly() : _stringSize(0) {}
+
+ BSONObj hydrate( const BSONObj& key ) const;
+
+ void addNo() { _add( false , "" ); }
+ void addYes( const string& name ) { _add( true , name ); }
+
+ private:
+
+ void _add( bool b , const string& name ) {
+ _include.push_back( b );
+ _names.push_back( name );
+ _stringSize += name.size();
+ }
+
+ vector<bool> _include; // one entry per field in key. true iff should be in output
+ vector<string> _names; // name of field since key doesn't have names
+
+ int _stringSize;
+ };
+
+ Projection() :
+ _include(true) ,
+ _special(false) ,
+ _includeID(true) ,
+ _skip(0) ,
+ _limit(-1) ,
+ _hasNonSimple(false) {
+ }
+
+ /**
+ * called once per lifetime
+ * e.g. { "x" : 1 , "a.y" : 1 }
+ */
+ void init( const BSONObj& spec );
+
+ /**
+ * @return the spec init was called with
+ */
+ BSONObj getSpec() const { return _source; }
+
+ /**
+ * transforms in according to spec
+ */
+ BSONObj transform( const BSONObj& in ) const;
+
+
+ /**
+ * transforms in according to spec
+ */
+ void transform( const BSONObj& in , BSONObjBuilder& b ) const;
+
+
+ /**
+         * @return a new KeyOnly if the keyPattern carries all the information
+         *         needed to compute the projection, otherwise null
+         * NOTE: key data may differ from the stored document data
+         *       (arrays, geo); that has to be handled above this layer
+ */
+ KeyOnly* checkKey( const BSONObj& keyPattern ) const;
+
+ bool includeID() const { return _includeID; }
+
+ private:
+
+ /**
+ * appends e to b if user wants it
+ * will descend into e if needed
+ */
+ void append( BSONObjBuilder& b , const BSONElement& e ) const;
+
+
+ void add( const string& field, bool include );
+ void add( const string& field, int skip, int limit );
+ void appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested=false) const;
+
+ bool _include; // true if default at this level is to include
+ bool _special; // true if this level can't be skipped or included without recursing
+
+ //TODO: benchmark vector<pair> vs map
+ typedef map<string, boost::shared_ptr<Projection> > FieldMap;
+ FieldMap _fields;
+ BSONObj _source;
+ bool _includeID;
+
+ // used for $slice operator
+ int _skip;
+ int _limit;
+
+ bool _hasNonSimple;
+ };
+
+
+}
diff --git a/src/mongo/db/queryoptimizer.cpp b/src/mongo/db/queryoptimizer.cpp
new file mode 100644
index 00000000000..9d9040d51e2
--- /dev/null
+++ b/src/mongo/db/queryoptimizer.cpp
@@ -0,0 +1,1337 @@
+// @file queryoptimizer.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "queryoptimizer.h"
+#include "cmdline.h"
+#include "clientcursor.h"
+
+//#define DEBUGQO(x) cout << x << endl;
+#define DEBUGQO(x)
+
+namespace mongo {
+
+ void checkTableScanAllowed( const char * ns ) {
+ if ( ! cmdLine.noTableScan )
+ return;
+
+ if ( strstr( ns , ".system." ) ||
+ strstr( ns , "local." ) )
+ return;
+
+ if ( ! nsdetails( ns ) )
+ return;
+
+ uassert( 10111 , (string)"table scans not allowed:" + ns , ! cmdLine.noTableScan );
+ }
+
+ double elementDirection( const BSONElement &e ) {
+ if ( e.isNumber() )
+ return e.number();
+ return 1;
+ }
+
+ QueryPlan::QueryPlan(
+ NamespaceDetails *d, int idxNo,
+ const FieldRangeSetPair &frsp, const FieldRangeSetPair *originalFrsp, const BSONObj &originalQuery, const BSONObj &order, bool mustAssertOnYieldFailure, const BSONObj &startKey, const BSONObj &endKey , string special ) :
+ _d(d), _idxNo(idxNo),
+ _frs( frsp.frsForIndex( _d, _idxNo ) ),
+ _frsMulti( frsp.frsForIndex( _d, -1 ) ),
+ _originalQuery( originalQuery ),
+ _order( order ),
+ _index( 0 ),
+ _optimal( false ),
+ _scanAndOrderRequired( true ),
+ _exactKeyMatch( false ),
+ _direction( 0 ),
+ _endKeyInclusive( endKey.isEmpty() ),
+ _unhelpful( false ),
+ _impossible( false ),
+ _special( special ),
+ _type(0),
+ _startOrEndSpec( !startKey.isEmpty() || !endKey.isEmpty() ),
+ _mustAssertOnYieldFailure( mustAssertOnYieldFailure ) {
+
+ BSONObj idxKey = _idxNo < 0 ? BSONObj() : d->idx( _idxNo ).keyPattern();
+
+ if ( !_frs.matchPossibleForIndex( idxKey ) ) {
+ _impossible = true;
+ _scanAndOrderRequired = false;
+ return;
+ }
+
+ if ( willScanTable() ) {
+ if ( _order.isEmpty() || !strcmp( _order.firstElementFieldName(), "$natural" ) )
+ _scanAndOrderRequired = false;
+ return;
+ }
+
+ _index = &d->idx(_idxNo);
+
+ // If the parsing or index indicates this is a special query, don't continue the processing
+ if ( _special.size() ||
+ ( _index->getSpec().getType() && _index->getSpec().getType()->suitability( originalQuery, order ) != USELESS ) ) {
+
+ if( _special.size() ) _optimal = true;
+
+ _type = _index->getSpec().getType();
+ if( !_special.size() ) _special = _index->getSpec().getType()->getPlugin()->getName();
+
+ massert( 13040 , (string)"no type for special: " + _special , _type );
+ // hopefully safe to use original query in these contexts - don't think we can mix special with $or clause separation yet
+ _scanAndOrderRequired = _type->scanAndOrderRequired( _originalQuery , order );
+ return;
+ }
+
+ const IndexSpec &idxSpec = _index->getSpec();
+ BSONObjIterator o( order );
+ BSONObjIterator k( idxKey );
+ if ( !o.moreWithEOO() )
+ _scanAndOrderRequired = false;
+ while( o.moreWithEOO() ) {
+ BSONElement oe = o.next();
+ if ( oe.eoo() ) {
+ _scanAndOrderRequired = false;
+ break;
+ }
+ if ( !k.moreWithEOO() )
+ break;
+ BSONElement ke;
+ while( 1 ) {
+ ke = k.next();
+ if ( ke.eoo() )
+ goto doneCheckOrder;
+ if ( strcmp( oe.fieldName(), ke.fieldName() ) == 0 )
+ break;
+ if ( !_frs.range( ke.fieldName() ).equality() )
+ goto doneCheckOrder;
+ }
+ int d = elementDirection( oe ) == elementDirection( ke ) ? 1 : -1;
+ if ( _direction == 0 )
+ _direction = d;
+ else if ( _direction != d )
+ break;
+ }
+doneCheckOrder:
+ if ( _scanAndOrderRequired )
+ _direction = 0;
+ BSONObjIterator i( idxKey );
+ int exactIndexedQueryCount = 0;
+ int optimalIndexedQueryCount = 0;
+ bool stillOptimalIndexedQueryCount = true;
+ set<string> orderFieldsUnindexed;
+ order.getFieldNames( orderFieldsUnindexed );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ const FieldRange &fr = _frs.range( e.fieldName() );
+ if ( stillOptimalIndexedQueryCount ) {
+ if ( fr.nontrivial() )
+ ++optimalIndexedQueryCount;
+ if ( !fr.equality() )
+ stillOptimalIndexedQueryCount = false;
+ }
+ else {
+ if ( fr.nontrivial() )
+ optimalIndexedQueryCount = -1;
+ }
+ if ( fr.equality() ) {
+ BSONElement e = fr.max();
+ if ( !e.isNumber() && !e.mayEncapsulate() && e.type() != RegEx )
+ ++exactIndexedQueryCount;
+ }
+ orderFieldsUnindexed.erase( e.fieldName() );
+ }
+ if ( !_scanAndOrderRequired &&
+ ( optimalIndexedQueryCount == _frs.nNontrivialRanges() ) )
+ _optimal = true;
+ if ( exactIndexedQueryCount == _frs.nNontrivialRanges() &&
+ orderFieldsUnindexed.size() == 0 &&
+ exactIndexedQueryCount == idxKey.nFields() &&
+ exactIndexedQueryCount == _originalQuery.nFields() ) {
+ _exactKeyMatch = true;
+ }
+ _frv.reset( new FieldRangeVector( _frs, idxSpec, _direction ) );
+ if ( originalFrsp ) {
+ _originalFrv.reset( new FieldRangeVector( originalFrsp->frsForIndex( _d, _idxNo ), idxSpec, _direction ) );
+ }
+ else {
+ _originalFrv = _frv;
+ }
+ if ( _startOrEndSpec ) {
+ BSONObj newStart, newEnd;
+ if ( !startKey.isEmpty() )
+ _startKey = startKey;
+ else
+ _startKey = _frv->startKey();
+ if ( !endKey.isEmpty() )
+ _endKey = endKey;
+ else
+ _endKey = _frv->endKey();
+ }
+
+ if ( ( _scanAndOrderRequired || _order.isEmpty() ) &&
+ !_frs.range( idxKey.firstElementFieldName() ).nontrivial() ) {
+ _unhelpful = true;
+ }
+ }
+
+ shared_ptr<Cursor> QueryPlan::newCursor( const DiskLoc &startLoc , int numWanted ) const {
+
+ if ( _type ) {
+ // hopefully safe to use original query in these contexts - don't think we can mix type with $or clause separation yet
+ return _type->newCursor( _originalQuery , _order , numWanted );
+ }
+
+ if ( _impossible ) {
+ // TODO We might want to allow this dummy table scan even in no table
+ // scan mode, since it won't scan anything.
+ if ( _frs.nNontrivialRanges() )
+ checkTableScanAllowed( _frs.ns() );
+ return shared_ptr<Cursor>( new BasicCursor( DiskLoc() ) );
+ }
+
+ if ( willScanTable() ) {
+ if ( _frs.nNontrivialRanges() ) {
+ checkTableScanAllowed( _frs.ns() );
+
+ // if we are doing a table scan on _id
+ // and it's a capped collection
+ // we warn /*disallow*/ as it's a common user error
+ // .system. and local collections are exempt
+ if ( _d && _d->capped && _frs.range( "_id" ).nontrivial() ) {
+ if ( cc().isSyncThread() ||
+ str::contains( _frs.ns() , ".system." ) ||
+ str::startsWith( _frs.ns() , "local." ) ) {
+ // ok
+ }
+ else {
+                        warning() << "_id query on capped collection without an _id index; performance will be poor. collection: " << _frs.ns() << endl;
+ //uassert( 14820, str::stream() << "doing _id query on a capped collection without an index is not allowed: " << _frs.ns() ,
+ }
+ }
+ }
+ return findTableScan( _frs.ns(), _order, startLoc );
+ }
+
+ massert( 10363 , "newCursor() with start location not implemented for indexed plans", startLoc.isNull() );
+
+ if ( _startOrEndSpec ) {
+ // we are sure to spec _endKeyInclusive
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _startKey, _endKey, _endKeyInclusive, _direction >= 0 ? 1 : -1 ) );
+ }
+ else if ( _index->getSpec().getType() ) {
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _frv->startKey(), _frv->endKey(), true, _direction >= 0 ? 1 : -1 ) );
+ }
+ else {
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _frv, _direction >= 0 ? 1 : -1 ) );
+ }
+ }
+
+ shared_ptr<Cursor> QueryPlan::newReverseCursor() const {
+ if ( willScanTable() ) {
+ int orderSpec = _order.getIntField( "$natural" );
+ if ( orderSpec == INT_MIN )
+ orderSpec = 1;
+ return findTableScan( _frs.ns(), BSON( "$natural" << -orderSpec ) );
+ }
+ massert( 10364 , "newReverseCursor() not implemented for indexed plans", false );
+ return shared_ptr<Cursor>();
+ }
+
+ BSONObj QueryPlan::indexKey() const {
+ if ( !_index )
+ return BSON( "$natural" << 1 );
+ return _index->keyPattern();
+ }
+
+ void QueryPlan::registerSelf( long long nScanned ) const {
+ // Impossible query constraints can be detected before scanning, and we
+ // don't have a reserved pattern enum value for impossible constraints.
+ if ( _impossible ) {
+ return;
+ }
+
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( _frs.pattern( _order ), indexKey(), nScanned );
+ }
+
+ /**
+ * @return a copy of the inheriting class, which will be run with its own
+ * query plan. If multiple plan sets are required for an $or query, the
+ * QueryOp of the winning plan from a given set will be cloned to generate
+ * QueryOps for the subsequent plan set. This function should only be called
+ * after the query op has completed executing.
+ */
+ QueryOp *QueryOp::createChild() {
+ if( _orConstraint.get() ) {
+ _matcher->advanceOrClause( _orConstraint );
+ _orConstraint.reset();
+ }
+ QueryOp *ret = _createChild();
+ ret->_oldMatcher = _matcher;
+ return ret;
+ }
+
+ bool QueryPlan::isMultiKey() const {
+ if ( _idxNo < 0 )
+ return false;
+ return _d->isMultikey( _idxNo );
+ }
+
+ void QueryOp::init() {
+ if ( _oldMatcher.get() ) {
+ _matcher.reset( _oldMatcher->nextClauseMatcher( qp().indexKey() ) );
+ }
+ else {
+ _matcher.reset( new CoveredIndexMatcher( qp().originalQuery(), qp().indexKey(), alwaysUseRecord() ) );
+ }
+ _init();
+ }
+
+ QueryPlanSet::QueryPlanSet( const char *ns, auto_ptr<FieldRangeSetPair> frsp, auto_ptr<FieldRangeSetPair> originalFrsp, const BSONObj &originalQuery, const BSONObj &order, bool mustAssertOnYieldFailure, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max, bool bestGuessOnly, bool mayYield ) :
+ _ns(ns),
+ _originalQuery( originalQuery ),
+ _frsp( frsp ),
+ _originalFrsp( originalFrsp ),
+ _mayRecordPlan( false ),
+ _usingCachedPlan( false ),
+ _hint( BSONObj() ),
+ _order( order.getOwned() ),
+ _oldNScanned( 0 ),
+ _honorRecordedPlan( honorRecordedPlan ),
+ _min( min.getOwned() ),
+ _max( max.getOwned() ),
+ _bestGuessOnly( bestGuessOnly ),
+ _mayYield( mayYield ),
+ _yieldSometimesTracker( 256, 20 ),
+ _mustAssertOnYieldFailure( mustAssertOnYieldFailure ) {
+ if ( hint && !hint->eoo() ) {
+ _hint = hint->wrap();
+ }
+ init();
+ }
+
+ bool QueryPlanSet::modifiedKeys() const {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i )
+ if ( (*i)->isMultiKey() )
+ return true;
+ return false;
+ }
+
+ bool QueryPlanSet::hasMultiKey() const {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i )
+ if ( (*i)->isMultiKey() )
+ return true;
+ return false;
+ }
+
+
+ void QueryPlanSet::addHint( IndexDetails &id ) {
+ if ( !_min.isEmpty() || !_max.isEmpty() ) {
+ string errmsg;
+ BSONObj keyPattern = id.keyPattern();
+ // This reformats _min and _max to be used for index lookup.
+ massert( 10365 , errmsg, indexDetailsForRange( _frsp->ns(), errmsg, _min, _max, keyPattern ) );
+ }
+ NamespaceDetails *d = nsdetails(_ns);
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(id), *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure, _min, _max ) ) );
+ }
+
+ // returns an IndexDetails * for a hint, 0 if hint is $natural.
+ // hint must not be eoo()
+ IndexDetails *parseHint( const BSONElement &hint, NamespaceDetails *d ) {
+ massert( 13292, "hint eoo", !hint.eoo() );
+ if( hint.type() == String ) {
+ string hintstr = hint.valuestr();
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if ( ii.indexName() == hintstr ) {
+ return &ii;
+ }
+ }
+ }
+ else if( hint.type() == Object ) {
+ BSONObj hintobj = hint.embeddedObject();
+ uassert( 10112 , "bad hint", !hintobj.isEmpty() );
+ if ( !strcmp( hintobj.firstElementFieldName(), "$natural" ) ) {
+ return 0;
+ }
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(hintobj) == 0 ) {
+ return &ii;
+ }
+ }
+ }
+ uassert( 10113 , "bad hint", false );
+ return 0;
+ }
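
The three hint forms parseHint() accepts, sketched with illustrative key patterns (d is the collection's NamespaceDetails*):

    parseHint( BSON( "hint" << "x_1" ).firstElement(), d );                   // by index name
    parseHint( BSON( "hint" << BSON( "x" << 1 ) ).firstElement(), d );        // by key pattern
    parseHint( BSON( "hint" << BSON( "$natural" << 1 ) ).firstElement(), d );
    // returns 0 for $natural, meaning force a table scan;
    // any other unresolvable hint fails with uassert 10113
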
+
+ void QueryPlanSet::init() {
+        DEBUGQO( "QueryPlanSet::init " << _ns << "\t" << _originalQuery );
+ _runner.reset();
+ _plans.clear();
+ _usingCachedPlan = false;
+
+ const char *ns = _frsp->ns();
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d || !_frsp->matchPossible() ) {
+ // Table scan plan, when no matches are possible
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+
+ BSONElement hint = _hint.firstElement();
+ if ( !hint.eoo() ) {
+ IndexDetails *id = parseHint( hint, d );
+ if ( id ) {
+ addHint( *id );
+ }
+ else {
+ massert( 10366 , "natural order cannot be specified with $min/$max", _min.isEmpty() && _max.isEmpty() );
+ // Table scan plan
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ }
+ return;
+ }
+
+ if ( !_min.isEmpty() || !_max.isEmpty() ) {
+ string errmsg;
+ BSONObj keyPattern;
+ IndexDetails *idx = indexDetailsForRange( ns, errmsg, _min, _max, keyPattern );
+ massert( 10367 , errmsg, idx );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(*idx), *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure, _min, _max ) ) );
+ return;
+ }
+
+ if ( isSimpleIdQuery( _originalQuery ) ) {
+ int idx = d->findIdIndex();
+ if ( idx >= 0 ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , idx , *_frsp , _originalFrsp.get() , _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+ }
+
+ if ( _originalQuery.isEmpty() && _order.isEmpty() ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+
+ DEBUGQO( "\t special : " << _frsp->getSpecial() );
+ if ( _frsp->getSpecial().size() ) {
+ _special = _frsp->getSpecial();
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ int j = i.pos();
+ IndexDetails& ii = i.next();
+ const IndexSpec& spec = ii.getSpec();
+ if ( spec.getTypeName() == _special && spec.suitability( _originalQuery , _order ) ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , j , *_frsp , _originalFrsp.get() , _originalQuery, _order ,
+ _mustAssertOnYieldFailure , BSONObj() , BSONObj() , _special ) ) );
+ return;
+ }
+ }
+ uassert( 13038 , (string)"can't find special index: " + _special + " for: " + _originalQuery.toString() , 0 );
+ }
+
+ if ( _honorRecordedPlan ) {
+ pair< BSONObj, long long > best = QueryUtilIndexed::bestIndexForPatterns( *_frsp, _order );
+ BSONObj bestIndex = best.first;
+ long long oldNScanned = best.second;
+ if ( !bestIndex.isEmpty() ) {
+ QueryPlanPtr p;
+ _oldNScanned = oldNScanned;
+ if ( !strcmp( bestIndex.firstElementFieldName(), "$natural" ) ) {
+ // Table scan plan
+ p.reset( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ }
+
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ int j = i.pos();
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(bestIndex) == 0 ) {
+ p.reset( new QueryPlan( d, j, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ }
+ }
+
+ massert( 10368 , "Unable to locate previously recorded index", p.get() );
+ if ( !( _bestGuessOnly && p->scanAndOrderRequired() ) ) {
+ _usingCachedPlan = true;
+ _plans.push_back( p );
+ return;
+ }
+ }
+ }
+
+ addOtherPlans( false );
+ }
+
+ void QueryPlanSet::addOtherPlans( bool checkFirst ) {
+ const char *ns = _frsp->ns();
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d )
+ return;
+
+ // If table scan is optimal or natural order requested or tailable cursor requested
+ if ( !_frsp->matchPossible() || ( _frsp->noNontrivialRanges() && _order.isEmpty() ) ||
+ ( !_order.isEmpty() && !strcmp( _order.firstElementFieldName(), "$natural" ) ) ) {
+ // Table scan plan
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ), checkFirst );
+ return;
+ }
+
+ bool normalQuery = _hint.isEmpty() && _min.isEmpty() && _max.isEmpty();
+
+ PlanSet plans;
+ QueryPlanPtr optimalPlan;
+ QueryPlanPtr specialPlan;
+ for( int i = 0; i < d->nIndexes; ++i ) {
+ if ( normalQuery ) {
+ BSONObj keyPattern = d->idx( i ).keyPattern();
+ if ( !_frsp->matchPossibleForIndex( d, i, keyPattern ) ) {
+                    // If no match is possible, only generate a trivial plan that won't
+ // scan any documents.
+ QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ addPlan( p, checkFirst );
+ return;
+ }
+ if ( !QueryUtilIndexed::indexUseful( *_frsp, d, i, _order ) ) {
+ continue;
+ }
+ }
+
+ QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ if ( p->optimal() ) {
+ if ( !optimalPlan.get() ) {
+ optimalPlan = p;
+ }
+ }
+ else if ( !p->unhelpful() ) {
+ if ( p->special().empty() ) {
+ plans.push_back( p );
+ }
+ else {
+ specialPlan = p;
+ }
+ }
+ }
+ if ( optimalPlan.get() ) {
+ addPlan( optimalPlan, checkFirst );
+ return;
+ }
+ for( PlanSet::const_iterator i = plans.begin(); i != plans.end(); ++i ) {
+ addPlan( *i, checkFirst );
+ }
+
+ // Only add a special plan if no standard btree plans have been added. SERVER-4531
+ if ( plans.empty() && specialPlan ) {
+ addPlan( specialPlan, checkFirst );
+ return;
+ }
+
+ // Table scan plan
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ), checkFirst );
+ _mayRecordPlan = true;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::runOp( QueryOp &op ) {
+ if ( _usingCachedPlan ) {
+ Runner r( *this, op );
+ shared_ptr<QueryOp> res = r.runUntilFirstCompletes();
+ // _plans.size() > 1 if addOtherPlans was called in Runner::runUntilFirstCompletes().
+ if ( _bestGuessOnly || res->complete() || _plans.size() > 1 )
+ return res;
+ // A cached plan was used, so clear the plan for this query pattern and retry the query without a cached plan.
+            // Careful here, as the namespace may have been dropped.
+ QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
+ init();
+ }
+ Runner r( *this, op );
+ return r.runUntilFirstCompletes();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::nextOp( QueryOp &originalOp, bool retried ) {
+ if ( !_runner ) {
+ _runner.reset( new Runner( *this, originalOp ) );
+ shared_ptr<QueryOp> op = _runner->init();
+ if ( op->complete() ) {
+ return op;
+ }
+ }
+ shared_ptr<QueryOp> op = _runner->nextNonError();
+ if ( !op->error() ) {
+ return op;
+ }
+ if ( !_usingCachedPlan || _bestGuessOnly || _plans.size() > 1 ) {
+ return op;
+ }
+
+ // Avoid an infinite loop here - this should never occur.
+ verify( 15878, !retried );
+
+ // A cached plan was used, so clear the plan for this query pattern and retry the query without a cached plan.
+ QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
+ init();
+ return nextOp( originalOp, true );
+ }
+
+ bool QueryPlanSet::prepareToYield() {
+ return _runner ? _runner->prepareToYield() : true;
+ }
+
+ void QueryPlanSet::recoverFromYield() {
+ if ( _runner ) {
+ _runner->recoverFromYield();
+ }
+ }
+
+ void QueryPlanSet::clearRunner() {
+ if ( _runner ) {
+ _runner.reset();
+ }
+ }
+
+ BSONObj QueryPlanSet::explain() const {
+ vector<BSONObj> arr;
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i ) {
+ shared_ptr<Cursor> c = (*i)->newCursor();
+ BSONObjBuilder explain;
+ explain.append( "cursor", c->toString() );
+ explain.append( "indexBounds", c->prettyIndexBounds() );
+ arr.push_back( explain.obj() );
+ }
+ BSONObjBuilder b;
+ b.append( "allPlans", arr );
+ return b.obj();
+ }
+
+ QueryPlanSet::QueryPlanPtr QueryPlanSet::getBestGuess() const {
+ assert( _plans.size() );
+ if ( _plans[ 0 ]->scanAndOrderRequired() ) {
+ for ( unsigned i=1; i<_plans.size(); i++ ) {
+ if ( ! _plans[i]->scanAndOrderRequired() )
+ return _plans[i];
+ }
+
+                warning() << "best guess query plan requested, but scan and order are required for all plans"
+                          << " query: " << _originalQuery
+ << " order: " << _order
+ << " choices: ";
+
+ for ( unsigned i=0; i<_plans.size(); i++ )
+ warning() << _plans[i]->indexKey() << " ";
+ warning() << endl;
+
+ return QueryPlanPtr();
+ }
+ return _plans[0];
+ }
+
+ QueryPlanSet::Runner::Runner( QueryPlanSet &plans, QueryOp &op ) :
+ _op( op ),
+ _plans( plans ) {
+ }
+
+ bool QueryPlanSet::Runner::prepareToYield() {
+ for( vector<shared_ptr<QueryOp> >::const_iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ if ( !prepareToYieldOp( **i ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void QueryPlanSet::Runner::recoverFromYield() {
+ for( vector<shared_ptr<QueryOp> >::const_iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ recoverFromYieldOp( **i );
+ }
+ }
+
+ void QueryPlanSet::Runner::mayYield() {
+ if ( ! _plans._mayYield )
+ return;
+
+ if ( ! _plans._yieldSometimesTracker.intervalHasElapsed() )
+ return;
+
+ int micros = ClientCursor::suggestYieldMicros();
+ if ( micros <= 0 )
+ return;
+
+ if ( !prepareToYield() )
+ return;
+
+ ClientCursor::staticYield( micros , _plans._ns , 0 );
+ recoverFromYield();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::init() {
+ massert( 10369 , "no plans", _plans._plans.size() > 0 );
+
+ if ( _plans._bestGuessOnly ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ shared_ptr<QueryPlan> plan = _plans.getBestGuess();
+ massert( 15894, "no index matches QueryPlanSet's sort with _bestGuessOnly", plan.get() );
+ op->setQueryPlan( plan.get() );
+ _ops.push_back( op );
+ }
+ else {
+ if ( _plans._plans.size() > 1 )
+ log(1) << " running multiple plans" << endl;
+ for( PlanSet::iterator i = _plans._plans.begin(); i != _plans._plans.end(); ++i ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ op->setQueryPlan( i->get() );
+ _ops.push_back( op );
+ }
+ }
+
+ // Initialize ops.
+ for( vector<shared_ptr<QueryOp> >::iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ initOp( **i );
+ if ( (*i)->complete() )
+ return *i;
+ }
+
+ // Put runnable ops in the priority queue.
+ for( vector<shared_ptr<QueryOp> >::iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ if ( !(*i)->error() ) {
+ _queue.push( *i );
+ }
+ }
+
+ return *_ops.begin();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::nextNonError() {
+ if ( _queue.empty() ) {
+ return *_ops.begin();
+ }
+ shared_ptr<QueryOp> ret;
+ do {
+ ret = next();
+ } while( ret->error() && !_queue.empty() );
+ return ret;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::next() {
+ mayYield();
+ dassert( !_queue.empty() );
+ OpHolder holder = _queue.pop();
+ QueryOp &op = *holder._op;
+ nextOp( op );
+ if ( op.complete() ) {
+ if ( _plans._mayRecordPlan && op.mayRecordPlan() ) {
+ op.qp().registerSelf( op.nscanned() );
+ }
+ return holder._op;
+ }
+ if ( op.error() ) {
+ return holder._op;
+ }
+ if ( !_plans._bestGuessOnly && _plans._usingCachedPlan && op.nscanned() > _plans._oldNScanned * 10 && _plans._special.empty() ) {
+ holder._offset = -op.nscanned();
+ _plans.addOtherPlans( /* avoid duplicating the initial plan */ true );
+ PlanSet::iterator i = _plans._plans.begin();
+ ++i;
+ for( ; i != _plans._plans.end(); ++i ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ op->setQueryPlan( i->get() );
+ _ops.push_back( op );
+ initOp( *op );
+ if ( op->complete() )
+ return op;
+ _queue.push( op );
+ }
+ _plans._usingCachedPlan = false;
+ }
+ _queue.push( holder );
+ return holder._op;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::runUntilFirstCompletes() {
+ shared_ptr<QueryOp> potentialFinisher = init();
+ if ( potentialFinisher->complete() ) {
+ return potentialFinisher;
+ }
+
+ while( !_queue.empty() ) {
+ shared_ptr<QueryOp> potentialFinisher = next();
+ if ( potentialFinisher->complete() ) {
+ return potentialFinisher;
+ }
+ }
+ return _ops[ 0 ];
+ }
+
+#define GUARD_OP_EXCEPTION( op, expression ) \
+ try { \
+ expression; \
+ } \
+ catch ( DBException& e ) { \
+ op.setException( e.getInfo() ); \
+ } \
+ catch ( const std::exception &e ) { \
+ op.setException( ExceptionInfo( e.what() , 0 ) ); \
+ } \
+ catch ( ... ) { \
+ op.setException( ExceptionInfo( "Caught unknown exception" , 0 ) ); \
+ }
+
+
+ void QueryPlanSet::Runner::initOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op, op.init() );
+ }
+
+ void QueryPlanSet::Runner::nextOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.next(); } );
+ }
+
+ bool QueryPlanSet::Runner::prepareToYieldOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op,
+ if ( op.error() ) {
+ return true;
+ }
+ else {
+ return op.prepareToYield();
+ } );
+ return true;
+ }
+
+ void QueryPlanSet::Runner::recoverFromYieldOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.recoverFromYield(); } );
+ }
+
+ /**
+ * NOTE on our $or implementation: In our current qo implementation we don't
+ * keep statistics on our data, but we can conceptualize the problem of
+ * selecting an index when statistics exist for all index ranges. The
+ * d-hitting set problem on k sets and n elements can be reduced to the
+ * problem of index selection on k $or clauses and n index ranges (where
+ * d is the max number of indexes, and the number of ranges n is unbounded).
+     * In light of the fact that d-hitting set is NP-complete, and we don't even
+ * track statistics (so cost calculations are expensive) our first
+ * implementation uses the following greedy approach: We take one $or clause
+ * at a time and treat each as a separate query for index selection purposes.
+ * But if an index range is scanned for a particular $or clause, we eliminate
+ * that range from all subsequent clauses. One could imagine an opposite
+ * implementation where we select indexes based on the union of index ranges
+ * for all $or clauses, but this can have much poorer worst case behavior.
+ * (An index range that suits one $or clause may not suit another, and this
+ * is worse than the typical case of index range choice staleness because
+ * with $or the clauses may likely be logically distinct.) The greedy
+ * implementation won't do any worse than all the $or clauses individually,
+ * and it can often do better. In the first cut we are intentionally using
+ * QueryPattern tracking to record successful plans on $or clauses for use by
+ * subsequent $or clauses, even though there may be a significant aggregate
+ * $nor component that would not be represented in QueryPattern.
+ */
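+
+    /*
+     * A concrete sketch of the greedy approach described above, using a
+     * hypothetical query and a single index on { a : 1 }:
+     *
+     *   { $or : [ { a : { $lt : 5 } }, { a : { $lt : 10 } } ] }
+     *
+     * The first clause is planned in isolation and scans the index range
+     * a in (-inf, 5). Before the second clause is planned, that range is
+     * eliminated from its constraints, so the second clause scans only
+     * a in [5, 10) rather than the full range a in (-inf, 10), and no
+     * document is visited by both clauses.
+     */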
+
+ MultiPlanScanner::MultiPlanScanner( const char *ns,
+ const BSONObj &query,
+ const BSONObj &order,
+ const BSONElement *hint,
+ bool honorRecordedPlan,
+ const BSONObj &min,
+ const BSONObj &max,
+ bool bestGuessOnly,
+ bool mayYield ) :
+ _ns( ns ),
+ _or( !query.getField( "$or" ).eoo() ),
+ _query( query.getOwned() ),
+ _i(),
+ _honorRecordedPlan( honorRecordedPlan ),
+ _bestGuessOnly( bestGuessOnly ),
+ _hint( ( hint && !hint->eoo() ) ? hint->wrap() : BSONObj() ),
+ _mayYield( mayYield ),
+ _tableScanned() {
+ if ( !order.isEmpty() || !min.isEmpty() || !max.isEmpty() ) {
+ _or = false;
+ }
+ if ( _or ) {
+ // Only construct an OrRangeGenerator if we may handle $or clauses.
+ _org.reset( new OrRangeGenerator( ns, _query ) );
+ if ( !_org->getSpecial().empty() ) {
+ _or = false;
+ }
+ else if ( uselessOr( _hint.firstElement() ) ) {
+ _or = false;
+ }
+ }
+ // if _or == false, don't use or clauses for index selection
+ if ( !_or ) {
+ auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, _query, true ) );
+ _currentQps.reset( new QueryPlanSet( ns, frsp, auto_ptr<FieldRangeSetPair>(), _query, order, false, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) );
+ }
+ else {
+ BSONElement e = _query.getField( "$or" );
+ massert( 13268, "invalid $or spec", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ }
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::runOpOnce( QueryOp &op ) {
+ assertMayRunMore();
+ if ( !_or ) {
+ ++_i;
+ return _currentQps->runOp( op );
+ }
+ ++_i;
+ auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() );
+ auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() );
+ BSONElement hintElt = _hint.firstElement();
+ _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), true, &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
+ shared_ptr<QueryOp> ret( _currentQps->runOp( op ) );
+ if ( ! ret->complete() )
+ throw MsgAssertionException( ret->exception() );
+ if ( ret->qp().willScanTable() ) {
+ _tableScanned = true;
+ } else {
+            // The full table was not scanned, so pop the or clause handled by this plan.
+ _org->popOrClause( ret->qp().nsd(), ret->qp().idxNo(), ret->qp().indexed() ? ret->qp().indexKey() : BSONObj() );
+ }
+ return ret;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::runOp( QueryOp &op ) {
+ shared_ptr<QueryOp> ret = runOpOnce( op );
+ while( !ret->stopRequested() && mayRunMore() ) {
+ ret = runOpOnce( *ret );
+ }
+ return ret;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOpHandleEndOfClause() {
+ shared_ptr<QueryOp> op = _currentQps->nextOp( *_baseOp );
+ if ( !op->complete() ) {
+ return op;
+ }
+ if ( op->qp().willScanTable() ) {
+ _tableScanned = true;
+ } else {
+ _org->popOrClause( op->qp().nsd(), op->qp().idxNo(), op->qp().indexed() ? op->qp().indexKey() : BSONObj() );
+ }
+ return op;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOpBeginningClause() {
+ assertMayRunMore();
+ shared_ptr<QueryOp> op;
+ while( mayRunMore() ) {
+ ++_i;
+ auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() );
+ auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() );
+ BSONElement hintElt = _hint.firstElement();
+ _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), true, &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
+ op = nextOpHandleEndOfClause();
+ if ( !op->complete() ) {
+ return op;
+ }
+ _baseOp = op;
+ }
+ return op;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOp() {
+ if ( !_or ) {
+ if ( _i == 0 ) {
+ assertMayRunMore();
+ ++_i;
+ }
+ return _currentQps->nextOp( *_baseOp );
+ }
+ if ( _i == 0 ) {
+ return nextOpBeginningClause();
+ }
+ shared_ptr<QueryOp> op = nextOpHandleEndOfClause();
+ if ( !op->complete() ) {
+ return op;
+ }
+ if ( !op->stopRequested() && mayRunMore() ) {
+ // Finished scanning the clause, but stop hasn't been requested.
+ // Start scanning the next clause.
+ _baseOp = op;
+ return nextOpBeginningClause();
+ }
+ return op;
+ }
+
+ bool MultiPlanScanner::prepareToYield() {
+ return _currentQps.get() ? _currentQps->prepareToYield() : true;
+ }
+
+ void MultiPlanScanner::recoverFromYield() {
+ if ( _currentQps.get() ) {
+ _currentQps->recoverFromYield();
+ }
+ }
+
+ void MultiPlanScanner::clearRunner() {
+ if ( _currentQps.get() ) {
+ _currentQps->clearRunner();
+ }
+ }
+
+ int MultiPlanScanner::currentNPlans() const {
+ return _currentQps.get() ? _currentQps->nPlans() : 0;
+ }
+
+ shared_ptr<Cursor> MultiPlanScanner::singleCursor() const {
+ const QueryPlan *qp = singlePlan();
+ if ( !qp ) {
+ return shared_ptr<Cursor>();
+ }
+ // If there is only one plan and it does not require an in memory
+ // sort, we do not expect its cursor op to throw an exception and
+ // so do not need a QueryOptimizerCursor to handle this case.
+ return qp->newCursor();
+ }
+
+ const QueryPlan *MultiPlanScanner::singlePlan() const {
+ if ( _or || _currentQps->nPlans() != 1 || _currentQps->firstPlan()->scanAndOrderRequired() || _currentQps->usingCachedPlan() ) {
+ return 0;
+ }
+ return _currentQps->firstPlan().get();
+ }
+
+ bool MultiPlanScanner::uselessOr( const BSONElement &hint ) const {
+ NamespaceDetails *nsd = nsdetails( _ns );
+ if ( !nsd ) {
+ return true;
+ }
+ if ( !hint.eoo() ) {
+ IndexDetails *id = parseHint( hint, nsd );
+ if ( !id ) {
+ return true;
+ }
+ return QueryUtilIndexed::uselessOr( *_org, nsd, nsd->idxNo( *id ) );
+ }
+ return QueryUtilIndexed::uselessOr( *_org, nsd, -1 );
+ }
+
+ MultiCursor::MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op, bool mayYield )
+ : _mps( new MultiPlanScanner( ns, pattern, order, 0, true, BSONObj(), BSONObj(), !op.get(), mayYield ) ), _nscanned() {
+ if ( op.get() ) {
+ _op = op;
+ }
+ else {
+ _op.reset( new NoOp() );
+ }
+ if ( _mps->mayRunMore() ) {
+ nextClause();
+ if ( !ok() ) {
+ advance();
+ }
+ }
+ else {
+ _c.reset( new BasicCursor( DiskLoc() ) );
+ }
+ }
+
+ MultiCursor::MultiCursor( auto_ptr<MultiPlanScanner> mps, const shared_ptr<Cursor> &c, const shared_ptr<CoveredIndexMatcher> &matcher, const QueryOp &op, long long nscanned )
+ : _op( new NoOp( op ) ), _c( c ), _mps( mps ), _matcher( matcher ), _nscanned( nscanned ) {
+ _mps->setBestGuessOnly();
+ _mps->mayYield( false ); // with a NoOp, there's no need to yield in QueryPlanSet
+ if ( !ok() ) {
+ // would have been advanced by UserQueryOp if possible
+ advance();
+ }
+ }
+
+ void MultiCursor::nextClause() {
+ if ( _nscanned >= 0 && _c.get() ) {
+ _nscanned += _c->nscanned();
+ }
+ shared_ptr<CursorOp> best = _mps->runOpOnce( *_op );
+ if ( ! best->complete() )
+ throw MsgAssertionException( best->exception() );
+ _c = best->newCursor();
+ _matcher = best->matcher( _c );
+ _op = best;
+ }
+
+ bool indexWorks( const BSONObj &idxPattern, const BSONObj &sampleKey, int direction, int firstSignificantField ) {
+ BSONObjIterator p( idxPattern );
+ BSONObjIterator k( sampleKey );
+ int i = 0;
+ while( 1 ) {
+ BSONElement pe = p.next();
+ BSONElement ke = k.next();
+ if ( pe.eoo() && ke.eoo() )
+ return true;
+ if ( pe.eoo() || ke.eoo() )
+ return false;
+ if ( strcmp( pe.fieldName(), ke.fieldName() ) != 0 )
+ return false;
+ if ( ( i == firstSignificantField ) && !( ( direction > 0 ) == ( pe.number() > 0 ) ) )
+ return false;
+ ++i;
+ }
+ return false;
+ }
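+
+    /*
+     * For illustration, with a hypothetical index pattern and sample key:
+     * indexWorks( BSON( "a" << 1 << "b" << -1 ), fromjson( "{a:1,b:1}" ), 1, 0 )
+     * returns true, since the field names line up and the first significant
+     * field (a) ascends, matching the requested direction; passing direction
+     * -1 instead would return false.
+     */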
+
+ BSONObj extremeKeyForIndex( const BSONObj &idxPattern, int baseDirection ) {
+ BSONObjIterator i( idxPattern );
+ BSONObjBuilder b;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ int idxDirection = e.number() >= 0 ? 1 : -1;
+ int direction = idxDirection * baseDirection;
+ switch( direction ) {
+ case 1:
+ b.appendMaxKey( e.fieldName() );
+ break;
+ case -1:
+ b.appendMinKey( e.fieldName() );
+ break;
+ default:
+ assert( false );
+ }
+ }
+ return b.obj();
+ }
+
+ pair<int,int> keyAudit( const BSONObj &min, const BSONObj &max ) {
+ int direction = 0;
+ int firstSignificantField = 0;
+ BSONObjIterator i( min );
+ BSONObjIterator a( max );
+ while( 1 ) {
+ BSONElement ie = i.next();
+ BSONElement ae = a.next();
+ if ( ie.eoo() && ae.eoo() )
+ break;
+ if ( ie.eoo() || ae.eoo() || strcmp( ie.fieldName(), ae.fieldName() ) != 0 ) {
+ return make_pair( -1, -1 );
+ }
+ int cmp = ie.woCompare( ae );
+ if ( cmp < 0 )
+ direction = 1;
+ if ( cmp > 0 )
+ direction = -1;
+ if ( direction != 0 )
+ break;
+ ++firstSignificantField;
+ }
+ return make_pair( direction, firstSignificantField );
+ }
+
+ pair<int,int> flexibleKeyAudit( const BSONObj &min, const BSONObj &max ) {
+ if ( min.isEmpty() || max.isEmpty() ) {
+ return make_pair( 1, -1 );
+ }
+ else {
+ return keyAudit( min, max );
+ }
+ }
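+
+    /*
+     * A worked example of the audit, with hypothetical keys: for
+     * min { a:1, b:5 } and max { a:1, b:9 }, the fields are compared in
+     * order; 'a' ties and 'b' compares less, so keyAudit() returns
+     * ( direction 1, firstSignificantField 1 ). flexibleKeyAudit() simply
+     * treats a missing min or max as an ascending scan with no significant
+     * field, i.e. ( 1, -1 ).
+     */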
+
+ // NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
+ IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
+ if ( min.isEmpty() && max.isEmpty() ) {
+ errmsg = "one of min or max must be specified";
+ return 0;
+ }
+
+ Client::Context ctx( ns );
+ IndexDetails *id = 0;
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d ) {
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ pair<int,int> ret = flexibleKeyAudit( min, max );
+ if ( ret == make_pair( -1, -1 ) ) {
+ errmsg = "min and max keys do not share pattern";
+ return 0;
+ }
+ if ( keyPattern.isEmpty() ) {
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if ( indexWorks( ii.keyPattern(), min.isEmpty() ? max : min, ret.first, ret.second ) ) {
+ if ( ii.getSpec().getType() == 0 ) {
+ id = &ii;
+ keyPattern = ii.keyPattern();
+ break;
+ }
+ }
+ }
+
+ }
+ else {
+ if ( !indexWorks( keyPattern, min.isEmpty() ? max : min, ret.first, ret.second ) ) {
+ errmsg = "requested keyPattern does not match specified keys";
+ return 0;
+ }
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(keyPattern) == 0 ) {
+ id = &ii;
+ break;
+ }
+ if ( keyPattern.nFields() == 1 && ii.keyPattern().nFields() == 1 &&
+ IndexDetails::isIdIndexPattern( keyPattern ) &&
+ ii.isIdIndex() ) {
+ id = &ii;
+ break;
+ }
+
+ }
+ }
+
+ if ( min.isEmpty() ) {
+ min = extremeKeyForIndex( keyPattern, -1 );
+ }
+ else if ( max.isEmpty() ) {
+ max = extremeKeyForIndex( keyPattern, 1 );
+ }
+
+ if ( !id ) {
+ errmsg = str::stream() << "no index found for specified keyPattern: " << keyPattern.toString()
+ << " min: " << min << " max: " << max;
+ return 0;
+ }
+
+ min = min.extractFieldsUnDotted( keyPattern );
+ max = max.extractFieldsUnDotted( keyPattern );
+
+ return id;
+ }
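+
+    /*
+     * Usage sketch, with a hypothetical namespace and keys: given
+     * min { a:1 } and an empty max, indexDetailsForRange( "test.foo", errmsg,
+     * min, max, keyPattern ) selects a compatible index, fills in max with the
+     * index's extreme key ( { a:MaxKey } here ), and rewrites min and max to
+     * match the index's key layout. A null return with errmsg populated means
+     * no compatible index was found.
+     */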
+
+ bool isSimpleIdQuery( const BSONObj& query ) {
+ BSONObjIterator i(query);
+
+ if( !i.more() )
+ return false;
+
+ BSONElement e = i.next();
+
+ if( i.more() )
+ return false;
+
+ if( strcmp("_id", e.fieldName()) != 0 )
+ return false;
+
+ if ( e.isSimpleType() ) // e.g. not something like { _id : { $gt : ...
+ return true;
+
+ if ( e.type() == Object )
+ return e.Obj().firstElementFieldName()[0] != '$';
+
+ return false;
+ }
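+
+    /*
+     * For illustration:
+     *   { _id : 5 }             -> true  (single field, simple type)
+     *   { _id : { a : 1 } }     -> true  (object value, first field not a $ operator)
+     *   { _id : { $gt : 5 } }   -> false (operator expression)
+     *   { _id : 5, x : 1 }      -> false (more than one field)
+     */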
+
+ shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort ) {
+ if( !query.getField( "$or" ).eoo() ) {
+ return shared_ptr<Cursor>( new MultiCursor( ns, query, sort ) );
+ }
+ else {
+ auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, query, true ) );
+ auto_ptr<FieldRangeSetPair> origFrsp( new FieldRangeSetPair( *frsp ) );
+
+ QueryPlanSet qps( ns, frsp, origFrsp, query, sort, false );
+ QueryPlanSet::QueryPlanPtr qpp = qps.getBestGuess();
+ if( ! qpp.get() ) return shared_ptr<Cursor>();
+
+ shared_ptr<Cursor> ret = qpp->newCursor();
+
+ // If we don't already have a matcher, supply one.
+ if ( !query.isEmpty() && ! ret->matcher() ) {
+ shared_ptr<CoveredIndexMatcher> matcher( new CoveredIndexMatcher( query, ret->indexKeyPattern() ) );
+ ret->setMatcher( matcher );
+ }
+ return ret;
+ }
+ }
+
+ bool QueryUtilIndexed::indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order ) {
+ DEV frsp.assertValidIndex( d, idxNo );
+ BSONObj keyPattern = d->idx( idxNo ).keyPattern();
+ if ( !frsp.matchPossibleForIndex( d, idxNo, keyPattern ) ) {
+            // No matches are possible using this index, so the index is useful for proving an empty result set.
+ return true;
+ }
+ return d->idx( idxNo ).getSpec().suitability( frsp.simplifiedQueryForIndex( d, idxNo, keyPattern ), order ) != USELESS;
+ }
+
+ void QueryUtilIndexed::clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( frsp.ns() );
+ nsd.registerIndexForPattern( frsp._singleKey.pattern( order ), BSONObj(), 0 );
+ nsd.registerIndexForPattern( frsp._multiKey.pattern( order ), BSONObj(), 0 );
+ }
+
+ pair< BSONObj, long long > QueryUtilIndexed::bestIndexForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( frsp.ns() );
+ // TODO Maybe it would make sense to return the index with the lowest
+ // nscanned if there are two possibilities.
+ if ( frsp._singleKey.matchPossible() ) {
+ QueryPattern pattern = frsp._singleKey.pattern( order );
+ BSONObj oldIdx = nsd.indexForPattern( pattern );
+ if ( !oldIdx.isEmpty() ) {
+ long long oldNScanned = nsd.nScannedForPattern( pattern );
+ return make_pair( oldIdx, oldNScanned );
+ }
+ }
+ if ( frsp._multiKey.matchPossible() ) {
+ QueryPattern pattern = frsp._multiKey.pattern( order );
+ BSONObj oldIdx = nsd.indexForPattern( pattern );
+ if ( !oldIdx.isEmpty() ) {
+ long long oldNScanned = nsd.nScannedForPattern( pattern );
+ return make_pair( oldIdx, oldNScanned );
+ }
+ }
+ return make_pair( BSONObj(), 0 );
+ }
+
+ bool QueryUtilIndexed::uselessOr( const OrRangeGenerator &org, NamespaceDetails *d, int hintIdx ) {
+ for( list<FieldRangeSetPair>::const_iterator i = org._originalOrSets.begin(); i != org._originalOrSets.end(); ++i ) {
+ if ( hintIdx != -1 ) {
+ if ( !indexUseful( *i, d, hintIdx, BSONObj() ) ) {
+ return true;
+ }
+ }
+ else {
+ bool useful = false;
+ for( int j = 0; j < d->nIndexes; ++j ) {
+ if ( indexUseful( *i, d, j, BSONObj() ) ) {
+ useful = true;
+ break;
+ }
+ }
+ if ( !useful ) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/queryoptimizer.h b/src/mongo/db/queryoptimizer.h
new file mode 100644
index 00000000000..297c6fe9505
--- /dev/null
+++ b/src/mongo/db/queryoptimizer.h
@@ -0,0 +1,599 @@
+// @file queryoptimizer.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "cursor.h"
+#include "jsobj.h"
+#include "queryutil.h"
+#include "matcher.h"
+#include "../util/net/listen.h"
+#include <queue>
+
+namespace mongo {
+
+ class IndexDetails;
+ class IndexType;
+ class ElapsedTracker;
+
+ /** A plan for executing a query using the given index spec and FieldRangeSet. */
+ class QueryPlan : boost::noncopyable {
+ public:
+
+ /**
+ * @param originalFrsp - original constraints for this query clause. If null, frsp will be used instead.
+ */
+ QueryPlan(NamespaceDetails *d,
+ int idxNo, // -1 = no index
+ const FieldRangeSetPair &frsp,
+ const FieldRangeSetPair *originalFrsp,
+ const BSONObj &originalQuery,
+ const BSONObj &order,
+ bool mustAssertOnYieldFailure = true,
+ const BSONObj &startKey = BSONObj(),
+ const BSONObj &endKey = BSONObj(),
+ string special="" );
+
+ /** @return true iff no other plans should be considered. */
+ bool optimal() const { return _optimal; }
+        /** @return true iff this plan should not be considered at all. */
+ bool unhelpful() const { return _unhelpful; }
+ /** @return true iff ScanAndOrder processing will be required for result set. */
+ bool scanAndOrderRequired() const { return _scanAndOrderRequired; }
+ /**
+ * @return true iff the index we are using has keys such that it can completely resolve the
+ * query expression to match by itself without ever checking the main object.
+ */
+ bool exactKeyMatch() const { return _exactKeyMatch; }
+ /** @return true iff this QueryPlan would perform an unindexed scan. */
+ bool willScanTable() const { return _idxNo < 0 && !_impossible; }
+ /** @return 'special' attribute of the plan, which was either set explicitly or generated from the index. */
+ const string &special() const { return _special; }
+
+ /** @return a new cursor based on this QueryPlan's index and FieldRangeSet. */
+ shared_ptr<Cursor> newCursor( const DiskLoc &startLoc = DiskLoc() , int numWanted=0 ) const;
+ /** @return a new reverse cursor if this is an unindexed plan. */
+ shared_ptr<Cursor> newReverseCursor() const;
+ /** Register this plan as a winner for its QueryPattern, with specified 'nscanned'. */
+ void registerSelf( long long nScanned ) const;
+
+ int direction() const { return _direction; }
+ BSONObj indexKey() const;
+ bool indexed() const { return _index; }
+ int idxNo() const { return _idxNo; }
+ const char *ns() const { return _frs.ns(); }
+ NamespaceDetails *nsd() const { return _d; }
+ BSONObj originalQuery() const { return _originalQuery; }
+ BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return _frs.simplifiedQuery( fields ); }
+ const FieldRange &range( const char *fieldName ) const { return _frs.range( fieldName ); }
+ shared_ptr<FieldRangeVector> originalFrv() const { return _originalFrv; }
+
+ const FieldRangeSet &multikeyFrs() const { return _frsMulti; }
+
+ bool mustAssertOnYieldFailure() const { return _mustAssertOnYieldFailure; }
+
+ /** The following member functions are just for testing. */
+
+ shared_ptr<FieldRangeVector> frv() const { return _frv; }
+ bool isMultiKey() const;
+
+ private:
+ NamespaceDetails * _d;
+ int _idxNo;
+ const FieldRangeSet &_frs;
+ const FieldRangeSet &_frsMulti;
+ const BSONObj &_originalQuery;
+ const BSONObj &_order;
+ const IndexDetails * _index;
+ bool _optimal;
+ bool _scanAndOrderRequired;
+ bool _exactKeyMatch;
+ int _direction;
+ shared_ptr<FieldRangeVector> _frv;
+ shared_ptr<FieldRangeVector> _originalFrv;
+ BSONObj _startKey;
+ BSONObj _endKey;
+ bool _endKeyInclusive;
+ bool _unhelpful;
+ bool _impossible;
+ string _special;
+ IndexType * _type;
+ bool _startOrEndSpec;
+ bool _mustAssertOnYieldFailure;
+ };
+
+ /**
+ * Inherit from this interface to implement a new query operation.
+ * The query optimizer will clone the QueryOp that is provided, giving
+ * each clone its own query plan.
+ *
+ * Normal sequence of events:
+ * 1) A new QueryOp is generated using createChild().
+ * 2) A QueryPlan is assigned to this QueryOp with setQueryPlan().
+ * 3) _init() is called on the QueryPlan.
+ * 4) next() is called repeatedly, with nscanned() checked after each call.
+ * 5) In one of these calls to next(), setComplete() is called.
+ * 6) The QueryPattern for the QueryPlan may be recorded as a winner.
+ */
+ class QueryOp {
+ public:
+ QueryOp() : _complete(), _stopRequested(), _qp(), _error() {}
+
+ /** Used when handing off from one QueryOp to another. */
+ QueryOp( const QueryOp &other ) :
+ _complete(), _stopRequested(), _qp(), _error(), _matcher( other._matcher ),
+ _orConstraint( other._orConstraint ) {}
+
+ virtual ~QueryOp() {}
+
+ /** @return QueryPlan assigned to this QueryOp by the query optimizer. */
+ const QueryPlan &qp() const { return *_qp; }
+
+ /** Advance to next potential matching document (eg using a cursor). */
+ virtual void next() = 0;
+ /**
+ * @return current 'nscanned' metric for this QueryOp. Used to compare
+ * cost to other QueryOps.
+ */
+ virtual long long nscanned() = 0;
+ /** Take any steps necessary before the db mutex is yielded. */
+ virtual bool prepareToYield() { massert( 13335, "yield not supported", false ); return false; }
+ /** Recover once the db mutex is regained. */
+ virtual void recoverFromYield() { massert( 13336, "yield not supported", false ); }
+
+ /**
+ * @return true iff the QueryPlan for this QueryOp may be registered
+ * as a winning plan.
+ */
+ virtual bool mayRecordPlan() const = 0;
+
+ /** @return true iff the implementation called setComplete() or setStop(). */
+ bool complete() const { return _complete; }
+        /** @return true iff the implementation called setStop(). */
+ bool stopRequested() const { return _stopRequested; }
+ /** @return true iff the implementation threw an exception. */
+ bool error() const { return _error; }
+ /** @return the exception thrown by implementation if one was thrown. */
+ ExceptionInfo exception() const { return _exception; }
+
+ /** To be called by QueryPlanSet::Runner only. */
+
+ QueryOp *createChild();
+ void setQueryPlan( const QueryPlan *qp ) { _qp = qp; assert( _qp != NULL ); }
+ void init();
+ void setException( const DBException &e ) {
+ _error = true;
+ _exception = e.getInfo();
+ }
+
+ shared_ptr<CoveredIndexMatcher> matcher( const shared_ptr<Cursor>& c ) const {
+ return matcher( c.get() );
+ }
+ shared_ptr<CoveredIndexMatcher> matcher( Cursor* c ) const {
+ if( ! c ) return _matcher;
+ return c->matcher() ? c->matcherPtr() : _matcher;
+ }
+
+ protected:
+ /** Call if all results have been found. */
+ void setComplete() {
+ _orConstraint = qp().originalFrv();
+ _complete = true;
+ }
+ /** Call if the scan is complete even if not all results have been found. */
+ void setStop() { setComplete(); _stopRequested = true; }
+
+ /** Handle initialization after a QueryPlan has been set. */
+ virtual void _init() = 0;
+
+ /** @return a copy of the inheriting class, which will be run with its own query plan. */
+ virtual QueryOp *_createChild() const = 0;
+
+ virtual bool alwaysUseRecord() const { return false; }
+
+ private:
+ bool _complete;
+ bool _stopRequested;
+ ExceptionInfo _exception;
+ const QueryPlan *_qp;
+ bool _error;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+ shared_ptr<CoveredIndexMatcher> _oldMatcher;
+ shared_ptr<FieldRangeVector> _orConstraint;
+ };
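+
+    /*
+     * A minimal sketch of a QueryOp subclass, for illustration only; the
+     * CountingOp name and its trivial counting logic are hypothetical:
+     *
+     *   class CountingOp : public QueryOp {
+     *   public:
+     *       CountingOp() : _n() {}
+     *       virtual void next() {
+     *           if ( !_c->ok() ) { setComplete(); return; }
+     *           ++_n;
+     *           _c->advance();
+     *       }
+     *       virtual long long nscanned() { return _c ? _c->nscanned() : 0; }
+     *       virtual bool mayRecordPlan() const { return true; }
+     *   protected:
+     *       virtual void _init() { _c = qp().newCursor(); }
+     *       virtual QueryOp *_createChild() const { return new CountingOp(); }
+     *   private:
+     *       shared_ptr<Cursor> _c;
+     *       long long _n;
+     *   };
+     */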
+
+    // Temporary implementation. Unlike a regular stl priority queue, this class works even if
+    // T::operator< varies over time. It is very slow, but if v.size() is always very small it is
+    // fine, and maybe even faster than a smarter implementation that does more memory allocations.
+ template<class T>
+ class our_priority_queue : boost::noncopyable {
+ vector<T> v;
+ public:
+ our_priority_queue() {
+ v.reserve(4);
+ }
+ int size() const { return v.size(); }
+ bool empty() const { return v.empty(); }
+ void push(const T & x) {
+ v.push_back(x);
+ }
+ T pop() {
+ size_t t = 0;
+ for( size_t i = 1; i < v.size(); i++ ) {
+ if( v[t] < v[i] )
+ t = i;
+ }
+ T ret = v[t];
+ v.erase(v.begin()+t);
+ return ret;
+ }
+ };
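+
+    /*
+     * For example, after push(3), push(1) and push(5) on an
+     * our_priority_queue<int>, successive pop() calls return 5, 3, 1: pop()
+     * scans the whole vector and removes the maximum element under operator<
+     * each time, which is what makes a time-varying operator< safe here.
+     */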
+
+ /**
+     * A set of candidate query plans for a query. This class can return a best guess plan or run a
+ * QueryOp on all the plans.
+ */
+ class QueryPlanSet {
+ public:
+
+ typedef boost::shared_ptr<QueryPlan> QueryPlanPtr;
+ typedef vector<QueryPlanPtr> PlanSet;
+
+ /**
+ * @param originalFrsp - original constraints for this query clause; if null, frsp will be used.
+ */
+ QueryPlanSet( const char *ns,
+ auto_ptr<FieldRangeSetPair> frsp,
+ auto_ptr<FieldRangeSetPair> originalFrsp,
+ const BSONObj &originalQuery,
+ const BSONObj &order,
+ bool mustAssertOnYieldFailure = true,
+ const BSONElement *hint = 0,
+ bool honorRecordedPlan = true,
+ const BSONObj &min = BSONObj(),
+ const BSONObj &max = BSONObj(),
+ bool bestGuessOnly = false,
+ bool mayYield = false);
+
+ /** @return number of candidate plans. */
+ int nPlans() const { return _plans.size(); }
+
+ /**
+ * Clone op for each query plan, and @return the first cloned op to call
+ * setComplete() or setStop().
+ */
+
+ shared_ptr<QueryOp> runOp( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOp( T &op ) {
+ return dynamic_pointer_cast<T>( runOp( static_cast<QueryOp&>( op ) ) );
+ }
+
+ /** Initialize or iterate a runner generated from @param originalOp. */
+ shared_ptr<QueryOp> nextOp( QueryOp &originalOp, bool retried = false );
+
+ /** Yield the runner member. */
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /** Clear the runner member. */
+ void clearRunner();
+
+ QueryPlanPtr firstPlan() const { return _plans[ 0 ]; }
+
+ /** @return metadata about cursors and index bounds for all plans, suitable for explain output. */
+ BSONObj explain() const;
+ /** @return true iff a plan is selected based on previous success of this plan. */
+ bool usingCachedPlan() const { return _usingCachedPlan; }
+ /** @return a single plan that may work well for the specified query. */
+ QueryPlanPtr getBestGuess() const;
+
+ //for testing
+ const FieldRangeSetPair &frsp() const { return *_frsp; }
+ const FieldRangeSetPair *originalFrsp() const { return _originalFrsp.get(); }
+ bool modifiedKeys() const;
+ bool hasMultiKey() const;
+
+ private:
+ void addOtherPlans( bool checkFirst );
+ void addPlan( QueryPlanPtr plan, bool checkFirst ) {
+ if ( checkFirst && plan->indexKey().woCompare( _plans[ 0 ]->indexKey() ) == 0 )
+ return;
+ _plans.push_back( plan );
+ }
+ void init();
+ void addHint( IndexDetails &id );
+ class Runner {
+ public:
+ Runner( QueryPlanSet &plans, QueryOp &op );
+
+ /**
+ * Iterate interactively through candidate documents on all plans.
+ * QueryOp objects are returned at each interleaved step.
+ */
+
+ /** @return a plan that has completed, otherwise an arbitrary plan. */
+ shared_ptr<QueryOp> init();
+ /**
+ * Move the Runner forward one iteration, and @return the plan for
+ * this iteration.
+ */
+ shared_ptr<QueryOp> next();
+ /** @return next non error op if there is one, otherwise an error op. */
+ shared_ptr<QueryOp> nextNonError();
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /** Run until first op completes. */
+ shared_ptr<QueryOp> runUntilFirstCompletes();
+
+ void mayYield();
+ QueryOp &_op;
+ QueryPlanSet &_plans;
+ static void initOp( QueryOp &op );
+ static void nextOp( QueryOp &op );
+ static bool prepareToYieldOp( QueryOp &op );
+ static void recoverFromYieldOp( QueryOp &op );
+ private:
+ vector<shared_ptr<QueryOp> > _ops;
+ struct OpHolder {
+ OpHolder( const shared_ptr<QueryOp> &op ) : _op( op ), _offset() {}
+ shared_ptr<QueryOp> _op;
+ long long _offset;
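+                // Note the inverted comparison: an op that has scanned more documents
+                // sorts as 'less', so our_priority_queue::pop(), which removes the
+                // maximum, always yields the op that has scanned the fewest documents.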
+ bool operator<( const OpHolder &other ) const {
+ return _op->nscanned() + _offset > other._op->nscanned() + other._offset;
+ }
+ };
+ our_priority_queue<OpHolder> _queue;
+ };
+
+ const char *_ns;
+ BSONObj _originalQuery;
+ auto_ptr<FieldRangeSetPair> _frsp;
+ auto_ptr<FieldRangeSetPair> _originalFrsp;
+ PlanSet _plans;
+ bool _mayRecordPlan;
+ bool _usingCachedPlan;
+ BSONObj _hint;
+ BSONObj _order;
+ long long _oldNScanned;
+ bool _honorRecordedPlan;
+ BSONObj _min;
+ BSONObj _max;
+ string _special;
+ bool _bestGuessOnly;
+ bool _mayYield;
+ ElapsedTracker _yieldSometimesTracker;
+ shared_ptr<Runner> _runner;
+ bool _mustAssertOnYieldFailure;
+ };
+
+ /** Handles $or type queries by generating a QueryPlanSet for each $or clause. */
+ class MultiPlanScanner {
+ public:
+ MultiPlanScanner( const char *ns,
+ const BSONObj &query,
+ const BSONObj &order,
+ const BSONElement *hint = 0,
+ bool honorRecordedPlan = true,
+ const BSONObj &min = BSONObj(),
+ const BSONObj &max = BSONObj(),
+ bool bestGuessOnly = false,
+ bool mayYield = false);
+
+ /**
+ * Clone op for each query plan of a single $or clause, and @return the first cloned op
+ * to call setComplete() or setStop().
+ */
+
+ shared_ptr<QueryOp> runOpOnce( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOpOnce( T &op ) {
+ return dynamic_pointer_cast<T>( runOpOnce( static_cast<QueryOp&>( op ) ) );
+ }
+
+ /**
+ * For each $or clause, calls runOpOnce on the child QueryOp cloned from the winning QueryOp
+ * of the previous $or clause (or from the supplied 'op' for the first $or clause).
+ */
+
+ shared_ptr<QueryOp> runOp( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOp( T &op ) {
+ return dynamic_pointer_cast<T>( runOp( static_cast<QueryOp&>( op ) ) );
+ }
+
+ /** Initialize or iterate a runner generated from @param originalOp. */
+
+ void initialOp( const shared_ptr<QueryOp> &originalOp ) { _baseOp = originalOp; }
+ shared_ptr<QueryOp> nextOp();
+
+ /** Yield the runner member. */
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /** Clear the runner member. */
+ void clearRunner();
+
+ int currentNPlans() const;
+
+ /**
+ * @return a single simple cursor if the scanner would run a single cursor
+ * for this query, otherwise return an empty shared_ptr.
+ */
+ shared_ptr<Cursor> singleCursor() const;
+
+ /**
+ * @return the query plan that would be used if the scanner would run a single
+ * cursor for this query, otherwise 0. The returned plan is invalid if this
+ * MultiPlanScanner is destroyed, hence we return a raw pointer.
+ */
+ const QueryPlan *singlePlan() const;
+
+ /** @return true iff more $or clauses need to be scanned. */
+ bool mayRunMore() const { return _or ? ( !_tableScanned && !_org->orFinished() ) : _i == 0; }
+ /** @return non-$or version of explain output. */
+ BSONObj oldExplain() const { assertNotOr(); return _currentQps->explain(); }
+ /** @return true iff this is not a $or query and a plan is selected based on previous success of this plan. */
+ bool usingCachedPlan() const { return !_or && _currentQps->usingCachedPlan(); }
+ /** Don't attempt to scan multiple plans, just use the best guess. */
+ void setBestGuessOnly() { _bestGuessOnly = true; }
+ /** Yielding is allowed while running each QueryPlan. */
+ void mayYield( bool val ) { _mayYield = val; }
+ bool modifiedKeys() const { return _currentQps->modifiedKeys(); }
+ bool hasMultiKey() const { return _currentQps->hasMultiKey(); }
+
+ private:
+ void assertNotOr() const {
+ massert( 13266, "not implemented for $or query", !_or );
+ }
+ void assertMayRunMore() const {
+ massert( 13271, "can't run more ops", mayRunMore() );
+ }
+ shared_ptr<QueryOp> nextOpBeginningClause();
+ shared_ptr<QueryOp> nextOpHandleEndOfClause();
+ bool uselessOr( const BSONElement &hint ) const;
+ const char * _ns;
+ bool _or;
+ BSONObj _query;
+ shared_ptr<OrRangeGenerator> _org; // May be null in certain non $or query cases.
+ auto_ptr<QueryPlanSet> _currentQps;
+ int _i;
+ bool _honorRecordedPlan;
+ bool _bestGuessOnly;
+ BSONObj _hint;
+ bool _mayYield;
+ bool _tableScanned;
+ shared_ptr<QueryOp> _baseOp;
+ };
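+
+    /*
+     * A sketch of the interactive iteration protocol, assuming 'op' holds a
+     * shared_ptr to some QueryOp subclass (the driver loop is hypothetical):
+     *
+     *   MultiPlanScanner mps( ns, query, order );
+     *   mps.initialOp( op );
+     *   while( true ) {
+     *       shared_ptr<QueryOp> current = mps.nextOp();
+     *       if ( current->error() )
+     *           throw MsgAssertionException( current->exception() );
+     *       if ( current->complete() )
+     *           break;
+     *       // examine the current op's state here, one iteration at a time
+     *   }
+     */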
+
+ /** Provides a cursor interface for certain limited uses of a MultiPlanScanner. */
+ class MultiCursor : public Cursor {
+ public:
+ class CursorOp : public QueryOp {
+ public:
+ CursorOp() {}
+ CursorOp( const QueryOp &other ) : QueryOp( other ) {}
+ virtual shared_ptr<Cursor> newCursor() const = 0;
+ };
+ /** takes ownership of 'op' */
+ MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op = shared_ptr<CursorOp>(), bool mayYield = false );
+ /**
+ * Used
+         * 1. To hand off a query to a getMore()
+         * 2. To hand off a QueryOptimizerCursor
+ * @param nscanned is an optional initial value, if not supplied nscanned()
+ * will always return -1
+ */
+ MultiCursor( auto_ptr<MultiPlanScanner> mps, const shared_ptr<Cursor> &c, const shared_ptr<CoveredIndexMatcher> &matcher, const QueryOp &op, long long nscanned = -1 );
+
+ virtual bool ok() { return _c->ok(); }
+ virtual Record* _current() { return _c->_current(); }
+ virtual BSONObj current() { return _c->current(); }
+ virtual DiskLoc currLoc() { return _c->currLoc(); }
+ virtual bool advance() {
+ _c->advance();
+ while( !ok() && _mps->mayRunMore() ) {
+ nextClause();
+ }
+ return ok();
+ }
+ virtual BSONObj currKey() const { return _c->currKey(); }
+ virtual DiskLoc refLoc() { return _c->refLoc(); }
+ virtual void noteLocation() { _c->noteLocation(); }
+ virtual void checkLocation() { _c->checkLocation(); }
+ virtual bool supportGetMore() { return true; }
+ virtual bool supportYields() { return _c->supportYields(); }
+ virtual BSONObj indexKeyPattern() { return _c->indexKeyPattern(); }
+
+ /**
+ * with update we could potentially get the same document on multiple
+ * indexes, but update appears to already handle this with seenObjects
+ * so we don't have to do anything special here.
+ */
+ virtual bool getsetdup(DiskLoc loc) { return _c->getsetdup( loc ); }
+
+ virtual bool autoDedup() const { return _c->autoDedup(); }
+
+ virtual bool modifiedKeys() const { return _mps->modifiedKeys(); }
+
+ virtual bool isMultiKey() const { return _mps->hasMultiKey(); }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
+ virtual CoveredIndexMatcher* matcher() const { return _matcher.get(); }
+
+ virtual bool capped() const { return _c->capped(); }
+
+ /** return -1 if we're a getmore handoff */
+ virtual long long nscanned() { return _nscanned >= 0 ? _nscanned + _c->nscanned() : _nscanned; }
+ /** just for testing */
+ shared_ptr<Cursor> sub_c() const { return _c; }
+ private:
+ class NoOp : public CursorOp {
+ public:
+ NoOp() {}
+ NoOp( const QueryOp &other ) : CursorOp( other ) {}
+ virtual void _init() { setComplete(); }
+ virtual void next() {}
+ virtual bool mayRecordPlan() const { return false; }
+ virtual QueryOp *_createChild() const { return new NoOp(); }
+ virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); }
+ virtual long long nscanned() { assert( false ); return 0; }
+ };
+ void nextClause();
+ shared_ptr<CursorOp> _op;
+ shared_ptr<Cursor> _c;
+ auto_ptr<MultiPlanScanner> _mps;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+ long long _nscanned;
+ };
+
+ /** NOTE min, max, and keyPattern will be updated to be consistent with the selected index. */
+ IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern );
+
+ bool isSimpleIdQuery( const BSONObj& query );
+
+ /**
+ * @return a single cursor that may work well for the given query.
+ * It is possible no cursor is returned if the sort is not supported by an index. Clients are responsible
+ * for checking this if they are not sure an index for a sort exists, and defaulting to a non-sort if
+ * no suitable indices exist.
+ */
+ shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort );
+
+ /**
+ * Add-on functionality for queryutil classes requiring access to indexing
+ * functionality not currently linked to mongos.
+ * TODO Clean this up a bit, possibly with separate sharded and non sharded
+ * implementations for the appropriate queryutil classes or by pulling index
+ * related functionality into separate wrapper classes.
+ */
+ struct QueryUtilIndexed {
+ /** @return true if the index may be useful according to its KeySpec. */
+ static bool indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order );
+ /** Clear any indexes recorded as the best for either the single or multi key pattern. */
+ static void clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order );
+ /** Return a recorded best index for the single or multi key pattern. */
+ static pair< BSONObj, long long > bestIndexForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order );
+ static bool uselessOr( const OrRangeGenerator& org, NamespaceDetails *d, int hintIdx );
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/queryoptimizercursor.cpp b/src/mongo/db/queryoptimizercursor.cpp
new file mode 100644
index 00000000000..07f8df12815
--- /dev/null
+++ b/src/mongo/db/queryoptimizercursor.cpp
@@ -0,0 +1,530 @@
+// @file queryoptimizercursor.cpp
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "queryoptimizer.h"
+#include "pdfile.h"
+#include "clientcursor.h"
+#include "btree.h"
+#include "queryoptimizercursor.h"
+
+namespace mongo {
+
+ static const int OutOfOrderDocumentsAssertionCode = 14810;
+
+ /**
+ * A QueryOp implementation utilized by the QueryOptimizerCursor
+ */
+ class QueryOptimizerCursorOp : public QueryOp {
+ public:
+ /**
+ * @param aggregateNscanned - shared long long counting total nscanned for
+ * query ops for all cursors.
+ * @param requireIndex - if unindexed scans should be prohibited.
+ */
+ QueryOptimizerCursorOp( long long &aggregateNscanned, bool requireIndex, int cumulativeCount = 0 ) : _matchCounter( aggregateNscanned, cumulativeCount ), _countingMatches(), _mustAdvance(), _capped(), _yieldRecoveryFailed(), _requireIndex( requireIndex ) {}
+
+ virtual void _init() {
+ if ( qp().scanAndOrderRequired() ) {
+ throw MsgAssertionException( OutOfOrderDocumentsAssertionCode, "order spec cannot be satisfied with index" );
+ }
+ if ( _requireIndex && strcmp( qp().indexKey().firstElementFieldName(), "$natural" ) == 0 ) {
+ throw MsgAssertionException( 9011, "Not an index cursor" );
+ }
+ _c = qp().newCursor();
+
+ // The QueryOptimizerCursor::prepareToTouchEarlierIterate() implementation requires _c->prepareToYield() to work.
+ verify( 15940, _c->supportYields() );
+ _capped = _c->capped();
+
+ // TODO This violates the current Cursor interface abstraction, but for now it's simpler to keep our own set of
+ // dups rather than avoid poisoning the cursor's dup set with unreturned documents. Deduping documents
+ // matched in this QueryOptimizerCursorOp will run against the takeover cursor.
+ _matchCounter.setCheckDups( _c->isMultiKey() );
+
+ _matchCounter.updateNscanned( _c->nscanned() );
+ }
+
+ virtual long long nscanned() {
+ return _c ? _c->nscanned() : _matchCounter.nscanned();
+ }
+
+ virtual bool prepareToYield() {
+ if ( _c && !_cc ) {
+ _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , qp().ns() ) );
+ }
+ if ( _cc ) {
+ recordCursorLocation();
+ return _cc->prepareToYield( _yieldData );
+ }
+ // no active cursor - ok to yield
+ return true;
+ }
+
+ virtual void recoverFromYield() {
+ if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _yieldRecoveryFailed = true;
+ _c.reset();
+ _cc.reset();
+
+ if ( _capped ) {
+ msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun: " << qp().ns() );
+ }
+ else if ( qp().mustAssertOnYieldFailure() ) {
+ msgassertedNoTrace( 15892, str::stream() << "QueryOptimizerCursorOp::recoverFromYield() failed to recover" );
+ }
+ else {
+ // we don't fail query since we're fine with returning partial data if collection dropped
+ // also, see SERVER-2454
+ }
+ }
+ else {
+ checkCursorAdvanced();
+ }
+ }
+
+ void prepareToTouchEarlierIterate() {
+ recordCursorLocation();
+ if ( _c ) {
+ _c->prepareToTouchEarlierIterate();
+ }
+ }
+
+ void recoverFromTouchingEarlierIterate() {
+ if ( _c ) {
+ _c->recoverFromTouchingEarlierIterate();
+ }
+ checkCursorAdvanced();
+ }
+
+ virtual void next() {
+ mayAdvance();
+
+ if ( _matchCounter.enoughCumulativeMatchesToChooseAPlan() ) {
+ setStop();
+ return;
+ }
+ if ( !_c || !_c->ok() ) {
+ setComplete();
+ return;
+ }
+
+ _mustAdvance = true;
+ }
+ virtual QueryOp *_createChild() const {
+ return new QueryOptimizerCursorOp( _matchCounter.aggregateNscanned(), _requireIndex, _matchCounter.cumulativeCount() );
+ }
+ DiskLoc currLoc() const { return _c ? _c->currLoc() : DiskLoc(); }
+ BSONObj currKey() const { return _c ? _c->currKey() : BSONObj(); }
+ bool currentMatches( MatchDetails *details ) {
+ bool ret = ( _c && _c->ok() ) ? matcher( _c.get() )->matchesCurrent( _c.get(), details ) : false;
+ // Cache the match, so we can count it in mayAdvance().
+ _matchCounter.setMatch( ret );
+ return ret;
+ }
+ virtual bool mayRecordPlan() const {
+ return !_yieldRecoveryFailed && complete() && ( !stopRequested() || _matchCounter.enoughMatchesToRecordPlan() );
+ }
+ shared_ptr<Cursor> cursor() const { return _c; }
+ private:
+ void mayAdvance() {
+ if ( !_c ) {
+ return;
+ }
+ if ( countingMatches() ) {
+ // Check match if not yet known.
+ if ( !_matchCounter.knowMatch() ) {
+ currentMatches( 0 );
+ }
+ _matchCounter.countMatch( currLoc() );
+ }
+ if ( _mustAdvance ) {
+ _c->advance();
+ handleCursorAdvanced();
+ }
+ _matchCounter.updateNscanned( _c->nscanned() );
+ }
+ // Don't count matches on the first call to next(), which occurs before the first result is returned.
+ bool countingMatches() {
+ if ( _countingMatches ) {
+ return true;
+ }
+ _countingMatches = true;
+ return false;
+ }
+
+ void recordCursorLocation() {
+ _posBeforeYield = currLoc();
+ }
+ void checkCursorAdvanced() {
+ // This check will not correctly determine if we are looking at a different document in
+ // all cases, but it is adequate for updating the query plan's match count (just used to pick
+ // plans, not returned to the client) and adjust iteration via _mustAdvance.
+ if ( _posBeforeYield != currLoc() ) {
+ // If the yield advanced our position, the next next() will be a no op.
+ handleCursorAdvanced();
+ }
+ }
+ void handleCursorAdvanced() {
+ _mustAdvance = false;
+ _matchCounter.resetMatch();
+ }
+
+ CachedMatchCounter _matchCounter;
+ bool _countingMatches;
+ bool _mustAdvance;
+ bool _capped;
+ shared_ptr<Cursor> _c;
+ ClientCursor::CleanupPointer _cc;
+ DiskLoc _posBeforeYield;
+ ClientCursor::YieldData _yieldData;
+ bool _yieldRecoveryFailed;
+ bool _requireIndex;
+ };
+
+ /**
+ * This cursor runs a MultiPlanScanner iteratively and returns results from
+ * the scanner's cursors as they become available. Once the scanner chooses
+ * a single plan, this cursor becomes a simple wrapper around that single
+ * plan's cursor (called the 'takeover' cursor).
+ */
+ class QueryOptimizerCursor : public Cursor {
+ public:
+ QueryOptimizerCursor( auto_ptr<MultiPlanScanner> &mps, bool requireIndex ) :
+ _mps( mps ),
+ _originalOp( new QueryOptimizerCursorOp( _nscanned, requireIndex ) ),
+ _currOp(),
+ _nscanned() {
+ _mps->initialOp( _originalOp );
+ shared_ptr<QueryOp> op = _mps->nextOp();
+ rethrowOnError( op );
+ if ( !op->complete() ) {
+ _currOp = dynamic_cast<QueryOptimizerCursorOp*>( op.get() );
+ }
+ }
+
+ virtual bool ok() { return _takeover ? _takeover->ok() : !currLoc().isNull(); }
+
+ virtual Record* _current() {
+ if ( _takeover ) {
+ return _takeover->_current();
+ }
+ assertOk();
+ return currLoc().rec();
+ }
+
+ virtual BSONObj current() {
+ if ( _takeover ) {
+ return _takeover->current();
+ }
+ assertOk();
+ return currLoc().obj();
+ }
+
+ virtual DiskLoc currLoc() { return _takeover ? _takeover->currLoc() : _currLoc(); }
+
+ DiskLoc _currLoc() const {
+ dassert( !_takeover );
+ return _currOp ? _currOp->currLoc() : DiskLoc();
+ }
+
+ virtual bool advance() {
+ return _advance( false );
+ }
+
+ virtual BSONObj currKey() const {
+ if ( _takeover ) {
+ return _takeover->currKey();
+ }
+ assertOk();
+ return _currOp->currKey();
+ }
+
+ /**
+         * When the return value isNull(), our cursor will be ignored for yielding by the client cursor implementation.
+ * In such cases, an internal ClientCursor will update the position of component cursors when necessary.
+ */
+ virtual DiskLoc refLoc() { return _takeover ? _takeover->refLoc() : DiskLoc(); }
+
+ virtual BSONObj indexKeyPattern() {
+ if ( _takeover ) {
+ return _takeover->indexKeyPattern();
+ }
+ assertOk();
+ return _currOp->cursor()->indexKeyPattern();
+ }
+
+ virtual bool supportGetMore() { return false; }
+
+ virtual bool supportYields() { return _takeover ? _takeover->supportYields() : true; }
+
+ virtual void prepareToTouchEarlierIterate() {
+ if ( _takeover ) {
+ _takeover->prepareToTouchEarlierIterate();
+ }
+ else if ( _currOp ) {
+ if ( _mps->currentNPlans() == 1 ) {
+ // This single plan version is a bit more performant, so we use it when possible.
+ _currOp->prepareToTouchEarlierIterate();
+ }
+ else {
+ // With multiple plans, the 'earlier iterate' could be the current iterate of one of
+ // the component plans. We do a full yield of all plans, using ClientCursors.
+ verify( 15941, _mps->prepareToYield() );
+ }
+ }
+ }
+
+ virtual void recoverFromTouchingEarlierIterate() {
+ if ( _takeover ) {
+ _takeover->recoverFromTouchingEarlierIterate();
+ }
+ else if ( _currOp ) {
+ if ( _mps->currentNPlans() == 1 ) {
+ _currOp->recoverFromTouchingEarlierIterate();
+ }
+ else {
+ recoverFromYield();
+ }
+ }
+ }
+
+ virtual bool prepareToYield() {
+ if ( _takeover ) {
+ return _takeover->prepareToYield();
+ }
+ else if ( _currOp ) {
+ return _mps->prepareToYield();
+ }
+ else {
+ // No state needs to be protected, so yielding is fine.
+ return true;
+ }
+ }
+
+ virtual void recoverFromYield() {
+ if ( _takeover ) {
+ _takeover->recoverFromYield();
+ return;
+ }
+ if ( _currOp ) {
+ _mps->recoverFromYield();
+ if ( _currOp->error() || !ok() ) {
+                    // Advance to a non error op if one of the ops errored out.
+ // Advance to a following $or clause if the $or clause returned all results.
+ _advance( true );
+ }
+ }
+ }
+
+ virtual string toString() { return "QueryOptimizerCursor"; }
+
+ virtual bool getsetdup(DiskLoc loc) {
+ if ( _takeover ) {
+ if ( getdupInternal( loc ) ) {
+ return true;
+ }
+ return _takeover->getsetdup( loc );
+ }
+ assertOk();
+ return getsetdupInternal( loc );
+ }
+
+        /** Matcher needs to know if the cursor being forwarded to is multikey. */
+ virtual bool isMultiKey() const {
+ if ( _takeover ) {
+ return _takeover->isMultiKey();
+ }
+ assertOk();
+ return _currOp->cursor()->isMultiKey();
+ }
+
+ virtual bool modifiedKeys() const { return true; }
+
+ /** Initial capped wrapping cases (before takeover) are handled internally by a component ClientCursor. */
+ virtual bool capped() const { return _takeover ? _takeover->capped() : false; }
+
+ virtual long long nscanned() { return _takeover ? _takeover->nscanned() : _nscanned; }
+
+ virtual shared_ptr<CoveredIndexMatcher> matcherPtr() const {
+ if ( _takeover ) {
+ return _takeover->matcherPtr();
+ }
+ assertOk();
+ return _currOp->matcher( _currOp->cursor() );
+ }
+
+ virtual CoveredIndexMatcher* matcher() const {
+ if ( _takeover ) {
+ return _takeover->matcher();
+ }
+ assertOk();
+ return _currOp->matcher( _currOp->cursor() ).get();
+ }
+
+ virtual bool currentMatches( MatchDetails *details = 0 ) {
+ if ( _takeover ) {
+ return _takeover->currentMatches( details );
+ }
+ assertOk();
+ return _currOp->currentMatches( details );
+ }
+
+ private:
+ /**
+ * Advances the QueryPlanSet::Runner.
+ * @param force - advance even if the current query op is not valid. The 'force' param should only be specified
+ * when there are plans left in the runner.
+ */
+ bool _advance( bool force ) {
+ if ( _takeover ) {
+ return _takeover->advance();
+ }
+
+ if ( !force && !ok() ) {
+ return false;
+ }
+
+ DiskLoc prevLoc = _currLoc();
+
+ _currOp = 0;
+ shared_ptr<QueryOp> op = _mps->nextOp();
+ rethrowOnError( op );
+
+ // Avoiding dynamic_cast here for performance. Soon we won't need to
+ // do a cast at all.
+ QueryOptimizerCursorOp *qocop = (QueryOptimizerCursorOp*)( op.get() );
+
+ if ( !op->complete() ) {
+ // The 'qocop' will be valid until we call _mps->nextOp() again. We return 'current' values from this op.
+ _currOp = qocop;
+ }
+ else if ( op->stopRequested() ) {
+ if ( qocop->cursor() ) {
+ // Ensure that prepareToTouchEarlierIterate() may be called safely when a BasicCursor takes over.
+ if ( !prevLoc.isNull() && prevLoc == qocop->currLoc() ) {
+ qocop->cursor()->advance();
+ }
+ // Clear the Runner and any unnecessary QueryOps and their ClientCursors.
+ _mps->clearRunner();
+ _takeover.reset( new MultiCursor( _mps,
+ qocop->cursor(),
+ op->matcher( qocop->cursor() ),
+ *op,
+ _nscanned - qocop->cursor()->nscanned() ) );
+ }
+ }
+
+ return ok();
+ }
+ /** Forward an exception when the runner errs out. */
+ void rethrowOnError( const shared_ptr< QueryOp > &op ) {
+ if ( op->error() ) {
+ throw MsgAssertionException( op->exception() );
+ }
+ }
+
+ void assertOk() const {
+ massert( 14809, "Invalid access for cursor that is not ok()", !_currLoc().isNull() );
+ }
+
+ /** Insert and check for dups before takeover occurs */
+ bool getsetdupInternal(const DiskLoc &loc) {
+ return _dups.getsetdup( loc );
+ }
+
+ /** Just check for dups - after takeover occurs */
+ bool getdupInternal(const DiskLoc &loc) {
+ dassert( _takeover );
+ return _dups.getdup( loc );
+ }
+
+ auto_ptr<MultiPlanScanner> _mps;
+ shared_ptr<QueryOptimizerCursorOp> _originalOp;
+ QueryOptimizerCursorOp *_currOp;
+ shared_ptr<Cursor> _takeover;
+ long long _nscanned;
+ // Using a SmallDupSet seems a bit hokey, but I've measured a 5% performance improvement with ~100 document non multi key scans.
+ SmallDupSet _dups;
+ };
+
+ shared_ptr<Cursor> newQueryOptimizerCursor( auto_ptr<MultiPlanScanner> mps, bool requireIndex ) {
+ try {
+ return shared_ptr<Cursor>( new QueryOptimizerCursor( mps, requireIndex ) );
+ } catch( const AssertionException &e ) {
+ if ( e.getCode() == OutOfOrderDocumentsAssertionCode ) {
+ // If no indexes follow the requested sort order, return an
+ // empty pointer. This is legacy behavior based on bestGuessCursor().
+ return shared_ptr<Cursor>();
+ }
+ throw;
+ }
+ return shared_ptr<Cursor>();
+ }
+
+ shared_ptr<Cursor> NamespaceDetailsTransient::getCursor( const char *ns, const BSONObj &query,
+ const BSONObj &order, bool requireIndex,
+ bool *simpleEqualityMatch ) {
+ if ( simpleEqualityMatch ) {
+ *simpleEqualityMatch = false;
+ }
+ if ( query.isEmpty() && order.isEmpty() && !requireIndex ) {
+ // TODO This will not use a covered index currently.
+ return theDataFileMgr.findAll( ns );
+ }
+ if ( isSimpleIdQuery( query ) ) {
+ Database *database = cc().database();
+ verify( 15985, database );
+ NamespaceDetails *d = database->namespaceIndex.details(ns);
+ if ( d ) {
+ int idxNo = d->findIdIndex();
+ if ( idxNo >= 0 ) {
+ IndexDetails& i = d->idx( idxNo );
+ BSONObj key = i.getKeyFromQuery( query );
+ return shared_ptr<Cursor>( BtreeCursor::make( d, idxNo, i, key, key, true, 1 ) );
+ }
+ }
+ }
+ auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) ); // mayYield == false
+ shared_ptr<Cursor> single = mps->singleCursor();
+ if ( single ) {
+ if ( !( requireIndex &&
+ dynamic_cast<BasicCursor*>( single.get() ) /* May not use an unindexed cursor */ ) ) {
+ if ( !query.isEmpty() && !single->matcher() ) {
+ shared_ptr<CoveredIndexMatcher> matcher( new CoveredIndexMatcher( query, single->indexKeyPattern() ) );
+ single->setMatcher( matcher );
+ }
+ if ( simpleEqualityMatch ) {
+ const QueryPlan *qp = mps->singlePlan();
+ if ( qp->exactKeyMatch() && !single->matcher()->needRecord() ) {
+ *simpleEqualityMatch = true;
+ }
+ }
+ return single;
+ }
+ }
+ return newQueryOptimizerCursor( mps, requireIndex );
+ }
+
+    /** This interface is available for testing only. */
+ shared_ptr<Cursor> newQueryOptimizerCursor( const char *ns, const BSONObj &query, const BSONObj &order, bool requireIndex ) {
+ auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) ); // mayYield == false
+ return newQueryOptimizerCursor( mps, requireIndex );
+ }
+
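+    /*
+     * Illustrative sketch (hypothetical, not part of this change): driving the
+     * testing interface above. Assumes a read lock is held and that
+     * "unittests.foo" exists; the namespace and query are made up.
+     *
+     *   shared_ptr<Cursor> c =
+     *       newQueryOptimizerCursor( "unittests.foo", BSON( "a" << GT << 0 ),
+     *                                BSONObj(), false );
+     *   for( ; c && c->ok(); c->advance() ) {
+     *       if ( c->currentMatches() && !c->getsetdup( c->currLoc() ) ) {
+     *           BSONObj doc = c->current(); // document under the cursor
+     *       }
+     *   }
+     */
+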
+} // namespace mongo
diff --git a/src/mongo/db/queryoptimizercursor.h b/src/mongo/db/queryoptimizercursor.h
new file mode 100644
index 00000000000..ee5a1663370
--- /dev/null
+++ b/src/mongo/db/queryoptimizercursor.h
@@ -0,0 +1,150 @@
+// @file queryoptimizercursor.h
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+namespace mongo {
+
+ /** Helper class for caching and counting matches during execution of a QueryPlan. */
+ class CachedMatchCounter {
+ public:
+ /**
+         * @param aggregateNscanned - shared count of nscanned for this and other plans.
+ * @param cumulativeCount - starting point for accumulated count over a series of plans.
+ */
+ CachedMatchCounter( long long &aggregateNscanned, int cumulativeCount ) : _aggregateNscanned( aggregateNscanned ), _nscanned(), _cumulativeCount( cumulativeCount ), _count(), _checkDups(), _match( Unknown ), _counted() {}
+
+ /** Set whether dup checking is enabled when counting. */
+ void setCheckDups( bool checkDups ) { _checkDups = checkDups; }
+
+ /**
+ * Usual sequence of events:
+         * 1) resetMatch() - reset stored match value to Unknown.
+ * 2) setMatch() - set match value to a definite true/false value.
+ * 3) knowMatch() - check if setMatch() has been called.
+ * 4) countMatch() - increment count if match is true.
+ */
+
+ void resetMatch() {
+ _match = Unknown;
+ _counted = false;
+ }
+ void setMatch( bool match ) { _match = match ? True : False; }
+ bool knowMatch() const { return _match != Unknown; }
+ void countMatch( const DiskLoc &loc ) {
+ if ( !_counted && _match == True && !getsetdup( loc ) ) {
+ ++_cumulativeCount;
+ ++_count;
+ _counted = true;
+ }
+ }
+
+ bool enoughCumulativeMatchesToChooseAPlan() const {
+            // 101 is the default batch limit at which a query switches to a
+            // getMore, and historically also the match count at which a plan
+            // was chosen.
+ return _cumulativeCount >= 101;
+ }
+ bool enoughMatchesToRecordPlan() const {
+ // Recording after 50 matches is a historical default (101 default limit / 2).
+ return _count > 50;
+ }
+
+ int cumulativeCount() const { return _cumulativeCount; }
+ int count() const { return _count; }
+
+ /** Update local and aggregate nscanned counts. */
+ void updateNscanned( long long nscanned ) {
+ _aggregateNscanned += ( nscanned - _nscanned );
+ _nscanned = nscanned;
+ }
+ long long nscanned() const { return _nscanned; }
+ long long &aggregateNscanned() const { return _aggregateNscanned; }
+ private:
+ bool getsetdup( const DiskLoc &loc ) {
+ if ( !_checkDups ) {
+ return false;
+ }
+ pair<set<DiskLoc>::iterator, bool> p = _dups.insert( loc );
+ return !p.second;
+ }
+ long long &_aggregateNscanned;
+ long long _nscanned;
+ int _cumulativeCount;
+ int _count;
+ bool _checkDups;
+ enum MatchState { Unknown, False, True };
+ MatchState _match;
+ bool _counted;
+ set<DiskLoc> _dups;
+ };
+
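+    /*
+     * Illustrative sketch of the call sequence documented above, for one
+     * document scanned by a plan ('matched', 'loc' and 'cursorNscanned' are
+     * hypothetical stand-ins):
+     *
+     *   long long aggregateNscanned = 0;
+     *   CachedMatchCounter counter( aggregateNscanned, 0 );
+     *   counter.setCheckDups( true );     // dedup multikey index locations
+     *   counter.resetMatch();             // 1) new document, match Unknown
+     *   if ( !counter.knowMatch() ) {     // 3) no verdict recorded yet
+     *       counter.setMatch( matched );  // 2) record the matcher's verdict
+     *   }
+     *   counter.countMatch( loc );        // 4) counts once per DiskLoc
+     *   counter.updateNscanned( cursorNscanned );
+     */
+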
+ /** Dup tracking class, optimizing one common case with small set and few initial reads. */
+ class SmallDupSet {
+ public:
+ SmallDupSet() : _accesses() {
+ _vec.reserve( 250 );
+ }
+        /** @return true if 'loc' was already in the set, false if this call added it. */
+ bool getsetdup( const DiskLoc &loc ) {
+ access();
+ return vec() ? getsetdupVec( loc ) : getsetdupSet( loc );
+ }
+ /** @return true when @param loc in the set. */
+ bool getdup( const DiskLoc &loc ) {
+ access();
+ return vec() ? getdupVec( loc ) : getdupSet( loc );
+ }
+ private:
+ void access() {
+ ++_accesses;
+ mayUpgrade();
+ }
+ void mayUpgrade() {
+ if ( vec() && _accesses > 500 ) {
+ _set.insert( _vec.begin(), _vec.end() );
+ }
+ }
+ bool vec() const {
+ return _set.size() == 0;
+ }
+ bool getsetdupVec( const DiskLoc &loc ) {
+ if ( getdupVec( loc ) ) {
+ return true;
+ }
+ _vec.push_back( loc );
+ return false;
+ }
+ bool getdupVec( const DiskLoc &loc ) const {
+ for( vector<DiskLoc>::const_iterator i = _vec.begin(); i != _vec.end(); ++i ) {
+ if ( *i == loc ) {
+ return true;
+ }
+ }
+ return false;
+ }
+ bool getsetdupSet( const DiskLoc &loc ) {
+ pair<set<DiskLoc>::iterator, bool> p = _set.insert(loc);
+ return !p.second;
+ }
+ bool getdupSet( const DiskLoc &loc ) {
+ return _set.count( loc ) > 0;
+ }
+ vector<DiskLoc> _vec;
+ set<DiskLoc> _set;
+ long long _accesses;
+ };
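+
+    /*
+     * Illustrative sketch (hypothetical location) of the two lookup modes:
+     *
+     *   SmallDupSet dups;
+     *   DiskLoc loc;           // some location produced by a cursor
+     *   dups.getsetdup( loc ); // false - newly recorded
+     *   dups.getsetdup( loc ); // true - already present
+     *   dups.getdup( loc );    // true - lookup without inserting
+     */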
+} // namespace mongo
diff --git a/src/mongo/db/querypattern.cpp b/src/mongo/db/querypattern.cpp
new file mode 100644
index 00000000000..e20e2b6a6ae
--- /dev/null
+++ b/src/mongo/db/querypattern.cpp
@@ -0,0 +1,99 @@
+// @file querypattern.cpp - Query pattern matching for selecting similar plans given similar queries.
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "querypattern.h"
+
+namespace mongo {
+
+ QueryPattern::QueryPattern( const FieldRangeSet &frs, const BSONObj &sort ) {
+ for( map<string,FieldRange>::const_iterator i = frs.ranges().begin(); i != frs.ranges().end(); ++i ) {
+ if ( i->second.equality() ) {
+ _fieldTypes[ i->first ] = QueryPattern::Equality;
+ }
+ else if ( i->second.empty() ) {
+ // This case generally results from an upper and lower bound that are inconsistent for a single key index.
+ _fieldTypes[ i->first ] = QueryPattern::UpperAndLowerBound;
+ }
+ else if ( i->second.nontrivial() ) {
+ bool upper = i->second.max().type() != MaxKey;
+ bool lower = i->second.min().type() != MinKey;
+ if ( upper && lower )
+ _fieldTypes[ i->first ] = QueryPattern::UpperAndLowerBound;
+ else if ( upper )
+ _fieldTypes[ i->first ] = QueryPattern::UpperBound;
+ else if ( lower )
+ _fieldTypes[ i->first ] = QueryPattern::LowerBound;
+ }
+ }
+ setSort( sort );
+ }
+
+ /** for testing only - speed unimportant */
+ bool QueryPattern::operator==( const QueryPattern &other ) const {
+ bool less = operator<( other );
+ bool more = other.operator<( *this );
+ assert( !( less && more ) );
+ return !( less || more );
+ }
+
+ /** for testing only - speed unimportant */
+ bool QueryPattern::operator!=( const QueryPattern &other ) const {
+ return !operator==( other );
+ }
+
+ string typeToString( enum QueryPattern::Type t ) {
+ switch (t) {
+ case QueryPattern::Equality:
+ return "Equality";
+ case QueryPattern::LowerBound:
+ return "LowerBound";
+ case QueryPattern::UpperBound:
+ return "UpperBound";
+ case QueryPattern::UpperAndLowerBound:
+ return "UpperAndLowerBound";
+ }
+ return "";
+ }
+
+ string QueryPattern::toString() const {
+ BSONObjBuilder b;
+ for( map<string,Type>::const_iterator i = _fieldTypes.begin(); i != _fieldTypes.end(); ++i ) {
+ b << i->first << typeToString( i->second );
+ }
+ return BSON( "query" << b.done() << "sort" << _sort ).toString();
+ }
+
+ void QueryPattern::setSort( const BSONObj sort ) {
+ _sort = normalizeSort( sort );
+ }
+
+ BSONObj QueryPattern::normalizeSort( const BSONObj &spec ) {
+ if ( spec.isEmpty() )
+ return spec;
+ int direction = ( spec.firstElement().number() >= 0 ) ? 1 : -1;
+ BSONObjIterator i( spec );
+ BSONObjBuilder b;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ b.append( e.fieldName(), direction * ( ( e.number() >= 0 ) ? -1 : 1 ) );
+ }
+ return b.obj();
+ }
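+
+    /*
+     * Worked example (illustrative): normalizeSort() keys off the sign of the
+     * first element, so a sort spec and its exact reverse normalize to the
+     * same object and yield matching QueryPatterns:
+     *
+     *   normalizeSort( BSON( "a" << 1 << "b" << -1 ) )  => { a: -1, b: 1 }
+     *   normalizeSort( BSON( "a" << -1 << "b" << 1 ) )  => { a: -1, b: 1 }
+     */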
+
+} // namespace mongo
diff --git a/src/mongo/db/querypattern.h b/src/mongo/db/querypattern.h
new file mode 100644
index 00000000000..000c301a0de
--- /dev/null
+++ b/src/mongo/db/querypattern.h
@@ -0,0 +1,78 @@
+// @file querypattern.h - Query pattern matching for selecting similar plans given similar queries.
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+#include "queryutil.h"
+
+namespace mongo {
+
+ /**
+ * Implements query pattern matching, used to determine if a query is
+ * similar to an earlier query and should use the same plan.
+ *
+ * Two queries will generate the same QueryPattern, and therefore match each
+ * other, if their fields have the same Types and they have the same sort
+ * spec.
+ */
+ class QueryPattern {
+ public:
+ QueryPattern( const FieldRangeSet &frs, const BSONObj &sort );
+ enum Type {
+ Equality,
+ LowerBound,
+ UpperBound,
+ UpperAndLowerBound
+ };
+ bool operator<( const QueryPattern &other ) const;
+ /** for testing only */
+ bool operator==( const QueryPattern &other ) const;
+ /** for testing only */
+ bool operator!=( const QueryPattern &other ) const;
+ /** for development / debugging */
+ string toString() const;
+ private:
+ void setSort( const BSONObj sort );
+ static BSONObj normalizeSort( const BSONObj &spec );
+ map<string,Type> _fieldTypes;
+ BSONObj _sort;
+ };
+
+ inline bool QueryPattern::operator<( const QueryPattern &other ) const {
+ map<string,Type>::const_iterator i = _fieldTypes.begin();
+ map<string,Type>::const_iterator j = other._fieldTypes.begin();
+ while( i != _fieldTypes.end() ) {
+ if ( j == other._fieldTypes.end() )
+ return false;
+ if ( i->first < j->first )
+ return true;
+ else if ( i->first > j->first )
+ return false;
+ if ( i->second < j->second )
+ return true;
+ else if ( i->second > j->second )
+ return false;
+ ++i;
+ ++j;
+ }
+ if ( j != other._fieldTypes.end() )
+ return true;
+ return _sort.woCompare( other._sort ) < 0;
+ }
+
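+    /*
+     * Illustrative sketch: operator< orders QueryPatterns by field name, then
+     * field Type, then sort spec, so a pattern can act as an ordered map key
+     * when caching a plan choice (the cache shown here is hypothetical):
+     *
+     *   map<QueryPattern,BSONObj> indexForPattern;
+     *   FieldRangeSet frs( "test.foo", BSON( "a" << 1 ), true, true );
+     *   indexForPattern[ frs.pattern( BSONObj() ) ] = BSON( "a" << 1 );
+     *   // a later query { a: 2 } yields an equal pattern and finds the entry
+     */
+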
+} // namespace mongo
diff --git a/src/mongo/db/queryutil-inl.h b/src/mongo/db/queryutil-inl.h
new file mode 100644
index 00000000000..08d3b1fac52
--- /dev/null
+++ b/src/mongo/db/queryutil-inl.h
@@ -0,0 +1,153 @@
+// @file queryutil-inl.h - Inline definitions for frequently called queryutil.h functions
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace mongo {
+
+ inline bool FieldInterval::equality() const {
+ if ( _cachedEquality == -1 ) {
+ _cachedEquality = ( _lower._inclusive && _upper._inclusive && _lower._bound.woCompare( _upper._bound, false ) == 0 );
+ }
+ return _cachedEquality != 0;
+ }
+
+ inline bool FieldRange::equality() const {
+ return
+ !empty() &&
+ min().woCompare( max(), false ) == 0 &&
+ maxInclusive() &&
+ minInclusive();
+ }
+
+ inline bool FieldRange::inQuery() const {
+ if ( equality() ) {
+ return true;
+ }
+ for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ if ( !i->equality() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * TODO Assumes intervals are contiguous and minKey/maxKey will not be
+ * matched against.
+ */
+ inline bool FieldRange::nontrivial() const {
+ return
+ ! empty() &&
+ ( _intervals.size() != 1 ||
+ minKey.firstElement().woCompare( min(), false ) != 0 ||
+ maxKey.firstElement().woCompare( max(), false ) != 0 );
+ }
+
+ inline const FieldRange &FieldRangeSet::range( const char *fieldName ) const {
+ map<string,FieldRange>::const_iterator f = _ranges.find( fieldName );
+ if ( f == _ranges.end() )
+ return trivialRange();
+ return f->second;
+ }
+
+ inline FieldRange &FieldRangeSet::range( const char *fieldName ) {
+ map<string,FieldRange>::iterator f = _ranges.find( fieldName );
+ if ( f == _ranges.end() ) {
+ _ranges.insert( make_pair( string( fieldName ), trivialRange() ) );
+ return _ranges.find( fieldName )->second;
+ }
+ return f->second;
+ }
+
+ inline int FieldRangeSet::nNontrivialRanges() const {
+ int count = 0;
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ if ( i->second.nontrivial() )
+ ++count;
+ }
+ return count;
+ }
+
+ inline bool FieldRangeSet::matchPossible() const {
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ if ( i->second.empty() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline bool FieldRangeSet::matchPossibleForIndex( const BSONObj &keyPattern ) const {
+ if ( !_singleKey ) {
+ return matchPossible();
+ }
+ BSONObjIterator i( keyPattern );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.fieldName() == string( "$natural" ) ) {
+ return true;
+ }
+ if ( range( e.fieldName() ).empty() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline long long FieldRangeVector::size() {
+ long long ret = 1;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ ret *= i->intervals().size();
+ }
+ return ret;
+ }
+
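+    /*
+     * Worked example (illustrative): size() is the product of the interval
+     * counts across indexed fields. For an index { a: 1, b: 1 } with bounds
+     * from { a: { $in: [ 1, 2, 3 ] }, b: 4 }, field "a" contributes three
+     * intervals and "b" one, so size() == 3 * 1 == 3 key ranges to scan.
+     */
+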
+ inline FieldRangeSetPair *OrRangeGenerator::topFrsp() const {
+ FieldRangeSetPair *ret = new FieldRangeSetPair( _baseSet );
+ if (_orSets.size()) {
+ *ret &= _orSets.front();
+ }
+ return ret;
+ }
+
+ inline FieldRangeSetPair *OrRangeGenerator::topFrspOriginal() const {
+ FieldRangeSetPair *ret = new FieldRangeSetPair( _baseSet );
+ if (_originalOrSets.size()) {
+ *ret &= _originalOrSets.front();
+ }
+ return ret;
+ }
+
+ inline bool FieldRangeSetPair::matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const {
+ assertValidIndexOrNoIndex( d, idxNo );
+ if ( !matchPossible() ) {
+ return false;
+ }
+ if ( idxNo < 0 ) {
+ // multi key matchPossible() is true, so return true.
+ return true;
+ }
+ return frsForIndex( d, idxNo ).matchPossibleForIndex( keyPattern );
+ }
+
+ inline void FieldRangeSetPair::assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const {
+ massert( 14049, "FieldRangeSetPair invalid index specified", idxNo >= -1 );
+ if ( idxNo >= 0 ) {
+ assertValidIndex( d, idxNo );
+ }
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/queryutil.cpp b/src/mongo/db/queryutil.cpp
new file mode 100644
index 00000000000..e6748c4bc2e
--- /dev/null
+++ b/src/mongo/db/queryutil.cpp
@@ -0,0 +1,1551 @@
+// @file queryutil.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#include "btree.h"
+#include "matcher.h"
+#include "pdfile.h"
+#include "queryoptimizer.h"
+#include "../util/unittest.h"
+#include "dbmessage.h"
+#include "indexkey.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+ extern BSONObj staticNull;
+ extern BSONObj staticUndefined;
+
+    /** returns a string that, when used as a matcher, would match a superset of regex()
+ returns "" for complex regular expressions
+ used to optimize queries in some simple regex cases that start with '^'
+
+ if purePrefix != NULL, sets it to whether the regex can be converted to a range query
+ */
+ string simpleRegex(const char* regex, const char* flags, bool* purePrefix) {
+ string r = "";
+
+ if (purePrefix) *purePrefix = false;
+
+ bool multilineOK;
+ if ( regex[0] == '\\' && regex[1] == 'A') {
+ multilineOK = true;
+ regex += 2;
+ }
+ else if (regex[0] == '^') {
+ multilineOK = false;
+ regex += 1;
+ }
+ else {
+ return r;
+ }
+
+ bool extended = false;
+ while (*flags) {
+ switch (*(flags++)) {
+ case 'm': // multiline
+ if (multilineOK)
+ continue;
+ else
+ return r;
+ case 'x': // extended
+ extended = true;
+ break;
+ default:
+                return r; // can't use index
+ }
+ }
+
+ stringstream ss;
+
+ while(*regex) {
+ char c = *(regex++);
+ if ( c == '*' || c == '?' ) {
+ // These are the only two symbols that make the last char optional
+ r = ss.str();
+ r = r.substr( 0 , r.size() - 1 );
+ return r; //breaking here fails with /^a?/
+ }
+ else if (c == '|') {
+ // whole match so far is optional. Nothing we can do here.
+ return string();
+ }
+ else if (c == '\\') {
+ c = *(regex++);
+ if (c == 'Q'){
+ // \Q...\E quotes everything inside
+ while (*regex) {
+ c = (*regex++);
+ if (c == '\\' && (*regex == 'E')){
+ regex++; //skip the 'E'
+ break; // go back to start of outer loop
+ }
+ else {
+ ss << c; // character should match itself
+ }
+ }
+ }
+ else if ((c >= 'A' && c <= 'Z') ||
+ (c >= 'a' && c <= 'z') ||
+                         (c >= '0' && c <= '9') ||
+ (c == '\0')) {
+ // don't know what to do with these
+ r = ss.str();
+ break;
+ }
+ else {
+                    // backslash followed by a non-alphanumeric char matches that char literally
+ ss << c;
+ }
+ }
+ else if (strchr("^$.[()+{", c)) {
+ // list of "metacharacters" from man pcrepattern
+ r = ss.str();
+ break;
+ }
+ else if (extended && c == '#') {
+ // comment
+ r = ss.str();
+ break;
+ }
+ else if (extended && isspace(c)) {
+ continue;
+ }
+ else {
+ // self-matching char
+ ss << c;
+ }
+ }
+
+ if ( r.empty() && *regex == 0 ) {
+ r = ss.str();
+ if (purePrefix) *purePrefix = !r.empty();
+ }
+
+ return r;
+ }
+ inline string simpleRegex(const BSONElement& e) {
+ switch(e.type()) {
+ case RegEx:
+ return simpleRegex(e.regex(), e.regexFlags());
+ case Object: {
+ BSONObj o = e.embeddedObject();
+ return simpleRegex(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe());
+ }
+ default: assert(false); return ""; //return squashes compiler warning
+ }
+ }
+
+ string simpleRegexEnd( string regex ) {
+ ++regex[ regex.length() - 1 ];
+ return regex;
+ }
+
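+    /*
+     * Worked example (illustrative): a prefix regex becomes a half open index
+     * range.
+     *
+     *   simpleRegex( "^abc", "", NULL ) => "abc"
+     *   simpleRegexEnd( "abc" )         => "abd"
+     *
+     * so /^abc/ scans keys in [ "abc", "abd" ), exactly the strings with
+     * prefix "abc".
+     */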
+
+ FieldRange::FieldRange( const BSONElement &e, bool singleKey, bool isNot, bool optimize )
+ : _singleKey( singleKey ) {
+ int op = e.getGtLtOp();
+
+ // NOTE with $not, we could potentially form a complementary set of intervals.
+ if ( !isNot && !e.eoo() && e.type() != RegEx && op == BSONObj::opIN ) {
+ set<BSONElement,element_lt> vals;
+ vector<FieldRange> regexes;
+ uassert( 12580 , "invalid query" , e.isABSONObj() );
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement ie = i.next();
+ uassert( 15881, "$elemMatch not allowed within $in",
+ ie.type() != Object ||
+ ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH );
+ if ( ie.type() == RegEx ) {
+ regexes.push_back( FieldRange( ie, singleKey, false, optimize ) );
+ }
+ else {
+ // A document array may be indexed by its first element, by undefined
+ // if it is empty, or as a full array if it is embedded within another
+ // array.
+ vals.insert( ie );
+ if ( ie.type() == Array ) {
+ BSONElement temp = ie.embeddedObject().firstElement();
+ if ( temp.eoo() ) {
+ temp = staticUndefined.firstElement();
+ }
+ vals.insert( temp );
+ }
+ }
+ }
+
+ for( set<BSONElement,element_lt>::const_iterator i = vals.begin(); i != vals.end(); ++i )
+ _intervals.push_back( FieldInterval(*i) );
+
+ for( vector<FieldRange>::const_iterator i = regexes.begin(); i != regexes.end(); ++i )
+ *this |= *i;
+
+ return;
+ }
+
+ // A document array may be indexed by its first element, by undefined
+ // if it is empty, or as a full array if it is embedded within another
+ // array.
+ if ( e.type() == Array && op == BSONObj::Equality ) {
+
+ _intervals.push_back( FieldInterval(e) );
+ BSONElement temp = e.embeddedObject().firstElement();
+ if ( temp.eoo() ) {
+ temp = staticUndefined.firstElement();
+ }
+ if ( temp < e ) {
+ _intervals.insert( _intervals.begin() , temp );
+ }
+ else {
+ _intervals.push_back( FieldInterval(temp) );
+ }
+
+ return;
+ }
+
+ _intervals.push_back( FieldInterval() );
+ FieldInterval &initial = _intervals[ 0 ];
+ BSONElement &lower = initial._lower._bound;
+ bool &lowerInclusive = initial._lower._inclusive;
+ BSONElement &upper = initial._upper._bound;
+ bool &upperInclusive = initial._upper._inclusive;
+ lower = minKey.firstElement();
+ lowerInclusive = true;
+ upper = maxKey.firstElement();
+ upperInclusive = true;
+
+ if ( e.eoo() )
+ return;
+
+ bool existsSpec = false;
+ if ( op == BSONObj::opEXISTS ) {
+ existsSpec = e.trueValue();
+ }
+
+ if ( e.type() == RegEx
+ || (e.type() == Object && !e.embeddedObject()["$regex"].eoo())
+ ) {
+ uassert( 13454, "invalid regular expression operator", op == BSONObj::Equality || op == BSONObj::opREGEX );
+ if ( !isNot ) { // no optimization for negated regex - we could consider creating 2 intervals comprising all nonmatching prefixes
+ const string r = simpleRegex(e);
+ if ( r.size() ) {
+ lower = addObj( BSON( "" << r ) ).firstElement();
+ upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement();
+ upperInclusive = false;
+ }
+ else {
+ BSONObjBuilder b1(32), b2(32);
+ b1.appendMinForType( "" , String );
+ lower = addObj( b1.obj() ).firstElement();
+
+ b2.appendMaxForType( "" , String );
+ upper = addObj( b2.obj() ).firstElement();
+ upperInclusive = false; //MaxForType String is an empty Object
+ }
+
+ // regex matches self - regex type > string type
+ if (e.type() == RegEx) {
+ BSONElement re = addObj( BSON( "" << e ) ).firstElement();
+ _intervals.push_back( FieldInterval(re) );
+ }
+ else {
+ BSONObj orig = e.embeddedObject();
+ BSONObjBuilder b;
+ b.appendRegex("", orig["$regex"].valuestrsafe(), orig["$options"].valuestrsafe());
+ BSONElement re = addObj( b.obj() ).firstElement();
+ _intervals.push_back( FieldInterval(re) );
+ }
+
+ }
+ return;
+ }
+ if ( isNot ) {
+ switch( op ) {
+ case BSONObj::Equality:
+ return;
+// op = BSONObj::NE;
+// break;
+ case BSONObj::opALL:
+ case BSONObj::opMOD: // NOTE for mod and type, we could consider having 1-2 intervals comprising the complementary types (multiple intervals already possible with $in)
+ case BSONObj::opTYPE:
+ // no bound calculation
+ return;
+ case BSONObj::NE:
+ op = BSONObj::Equality;
+ break;
+ case BSONObj::LT:
+ op = BSONObj::GTE;
+ break;
+ case BSONObj::LTE:
+ op = BSONObj::GT;
+ break;
+ case BSONObj::GT:
+ op = BSONObj::LTE;
+ break;
+ case BSONObj::GTE:
+ op = BSONObj::LT;
+ break;
+ case BSONObj::opEXISTS:
+ existsSpec = !existsSpec;
+ break;
+ default: // otherwise doesn't matter
+ break;
+ }
+ }
+ switch( op ) {
+ case BSONObj::Equality:
+ lower = upper = e;
+ break;
+ case BSONObj::NE: {
+ // this will invalidate the upper/lower references above
+ _intervals.push_back( FieldInterval() );
+ // optimize doesn't make sense for negative ranges
+ _intervals[ 0 ]._upper._bound = e;
+ _intervals[ 0 ]._upper._inclusive = false;
+ _intervals[ 1 ]._lower._bound = e;
+ _intervals[ 1 ]._lower._inclusive = false;
+ _intervals[ 1 ]._upper._bound = maxKey.firstElement();
+ _intervals[ 1 ]._upper._inclusive = true;
+ optimize = false; // don't run optimize code below
+ break;
+ }
+ case BSONObj::LT:
+ upperInclusive = false;
+ case BSONObj::LTE:
+ upper = e;
+ break;
+ case BSONObj::GT:
+ lowerInclusive = false;
+ case BSONObj::GTE:
+ lower = e;
+ break;
+ case BSONObj::opALL: {
+ uassert( 10370 , "$all requires array", e.type() == Array );
+ BSONObjIterator i( e.embeddedObject() );
+ bool bound = false;
+ while ( i.more() ) {
+ BSONElement x = i.next();
+ if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
+ // taken care of elsewhere
+ }
+ else if ( x.type() != RegEx ) {
+ lower = upper = x;
+ bound = true;
+ break;
+ }
+ }
+ if ( !bound ) { // if no good non regex bound found, try regex bounds
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement x = i.next();
+ if ( x.type() != RegEx )
+ continue;
+ string simple = simpleRegex( x.regex(), x.regexFlags() );
+ if ( !simple.empty() ) {
+ lower = addObj( BSON( "" << simple ) ).firstElement();
+ upper = addObj( BSON( "" << simpleRegexEnd( simple ) ) ).firstElement();
+ break;
+ }
+ }
+ }
+ break;
+ }
+ case BSONObj::opMOD: {
+ {
+ BSONObjBuilder b;
+ b.appendMinForType( "" , NumberDouble );
+ lower = addObj( b.obj() ).firstElement();
+ }
+ {
+ BSONObjBuilder b;
+ b.appendMaxForType( "" , NumberDouble );
+ upper = addObj( b.obj() ).firstElement();
+ }
+ break;
+ }
+ case BSONObj::opTYPE: {
+ BSONType t = (BSONType)e.numberInt();
+ {
+ BSONObjBuilder b;
+ b.appendMinForType( "" , t );
+ lower = addObj( b.obj() ).firstElement();
+ }
+ {
+ BSONObjBuilder b;
+ b.appendMaxForType( "" , t );
+ upper = addObj( b.obj() ).firstElement();
+ }
+
+ break;
+ }
+ case BSONObj::opREGEX:
+ case BSONObj::opOPTIONS:
+ // do nothing
+ break;
+ case BSONObj::opELEM_MATCH: {
+ log() << "warning: shouldn't get here?" << endl;
+ break;
+ }
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ _special = "2d";
+ break;
+ case BSONObj::opEXISTS: {
+ if ( !existsSpec ) {
+ lower = upper = staticNull.firstElement();
+ }
+ optimize = false;
+ break;
+ }
+ default:
+ break;
+ }
+
+ if ( optimize ) {
+ if ( lower.type() != MinKey && upper.type() == MaxKey && lower.isSimpleType() ) { // TODO: get rid of isSimpleType
+ BSONObjBuilder b;
+ b.appendMaxForType( lower.fieldName() , lower.type() );
+ upper = addObj( b.obj() ).firstElement();
+ }
+ else if ( lower.type() == MinKey && upper.type() != MaxKey && upper.isSimpleType() ) { // TODO: get rid of isSimpleType
+ if( upper.type() == Date )
+ lowerInclusive = false;
+ BSONObjBuilder b;
+ b.appendMinForType( upper.fieldName() , upper.type() );
+ lower = addObj( b.obj() ).firstElement();
+ }
+ }
+
+ }
+
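+    /*
+     * Worked examples (illustrative): intervals the constructor above
+     * produces for common operators on a single key index.
+     *
+     *   { $gt: 5 }           => ( 5, maxForType ]        after optimize
+     *   { $ne: 5 }           => [ MinKey, 5 ) , ( 5, MaxKey ]
+     *   { $in: [ 3, 1, 1 ] } => [ 1, 1 ] , [ 3, 3 ]      sorted and deduped
+     */
+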
+ void FieldRange::finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other ) {
+ _intervals = newIntervals;
+ for( vector<BSONObj>::const_iterator i = other._objData.begin(); i != other._objData.end(); ++i )
+ _objData.push_back( *i );
+ if ( _special.size() == 0 && other._special.size() )
+ _special = other._special;
+ }
+
+ // as called, these functions find the max/min of a bound in the
+ // opposite direction, so inclusive bounds are considered less
+ // superlative
+ FieldBound maxFieldBound( const FieldBound &a, const FieldBound &b ) {
+ int cmp = a._bound.woCompare( b._bound, false );
+ if ( ( cmp == 0 && !b._inclusive ) || cmp < 0 )
+ return b;
+ return a;
+ }
+
+ FieldBound minFieldBound( const FieldBound &a, const FieldBound &b ) {
+ int cmp = a._bound.woCompare( b._bound, false );
+ if ( ( cmp == 0 && !b._inclusive ) || cmp > 0 )
+ return b;
+ return a;
+ }
+
+ bool fieldIntervalOverlap( const FieldInterval &one, const FieldInterval &two, FieldInterval &result ) {
+ result._lower = maxFieldBound( one._lower, two._lower );
+ result._upper = minFieldBound( one._upper, two._upper );
+ return result.strictValid();
+ }
+
+ const FieldRange &FieldRange::operator&=( const FieldRange &other ) {
+ if ( !_singleKey && nontrivial() ) {
+ if ( other <= *this ) {
+ *this = other;
+ }
+ return *this;
+ }
+ vector<FieldInterval> newIntervals;
+ vector<FieldInterval>::const_iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
+ while( i != _intervals.end() && j != other._intervals.end() ) {
+ FieldInterval overlap;
+ if ( fieldIntervalOverlap( *i, *j, overlap ) ) {
+ newIntervals.push_back( overlap );
+ }
+ if ( i->_upper == minFieldBound( i->_upper, j->_upper ) ) {
+ ++i;
+ }
+ else {
+ ++j;
+ }
+ }
+ finishOperation( newIntervals, other );
+ return *this;
+ }
+
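+    /*
+     * Worked examples (illustrative): intersection walks both interval lists
+     * in step and keeps only the overlaps.
+     *
+     *   [ 1, 10 ] &= [ 5, 20 ]               => [ 5, 10 ]
+     *   ( [ 1, 3 ] , [ 7, 9 ] ) &= [ 2, 8 ]  => ( [ 2, 3 ] , [ 7, 8 ] )
+     */
+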
+ void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector<FieldInterval> &newIntervals ) {
+ if ( low._bound.eoo() ) {
+ low = lower._lower; high = lower._upper;
+ }
+ else {
+ int cmp = high._bound.woCompare( lower._lower._bound, false );
+ if ( ( cmp < 0 ) || ( cmp == 0 && !high._inclusive && !lower._lower._inclusive ) ) {
+ FieldInterval tmp;
+ tmp._lower = low;
+ tmp._upper = high;
+ newIntervals.push_back( tmp );
+ low = lower._lower; high = lower._upper;
+ }
+ else {
+ high = lower._upper;
+ }
+ }
+ }
+
+ const FieldRange &FieldRange::operator|=( const FieldRange &other ) {
+ vector<FieldInterval> newIntervals;
+ FieldBound low;
+ FieldBound high;
+ vector<FieldInterval>::const_iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
+ while( i != _intervals.end() && j != other._intervals.end() ) {
+ int cmp = i->_lower._bound.woCompare( j->_lower._bound, false );
+ if ( ( cmp == 0 && i->_lower._inclusive ) || cmp < 0 ) {
+ handleInterval( *i, low, high, newIntervals );
+ ++i;
+ }
+ else {
+ handleInterval( *j, low, high, newIntervals );
+ ++j;
+ }
+ }
+ while( i != _intervals.end() ) {
+ handleInterval( *i, low, high, newIntervals );
+ ++i;
+ }
+ while( j != other._intervals.end() ) {
+ handleInterval( *j, low, high, newIntervals );
+ ++j;
+ }
+ FieldInterval tmp;
+ tmp._lower = low;
+ tmp._upper = high;
+ newIntervals.push_back( tmp );
+ finishOperation( newIntervals, other );
+ return *this;
+ }
+
+ const FieldRange &FieldRange::operator-=( const FieldRange &other ) {
+ vector<FieldInterval> newIntervals;
+ vector<FieldInterval>::iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
+ while( i != _intervals.end() && j != other._intervals.end() ) {
+ int cmp = i->_lower._bound.woCompare( j->_lower._bound, false );
+ if ( cmp < 0 ||
+ ( cmp == 0 && i->_lower._inclusive && !j->_lower._inclusive ) ) {
+ int cmp2 = i->_upper._bound.woCompare( j->_lower._bound, false );
+ if ( cmp2 < 0 ) {
+ newIntervals.push_back( *i );
+ ++i;
+ }
+ else if ( cmp2 == 0 ) {
+ newIntervals.push_back( *i );
+ if ( newIntervals.back()._upper._inclusive && j->_lower._inclusive ) {
+ newIntervals.back()._upper._inclusive = false;
+ }
+ ++i;
+ }
+ else {
+ newIntervals.push_back( *i );
+ newIntervals.back()._upper = j->_lower;
+ newIntervals.back()._upper.flipInclusive();
+ int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false );
+ if ( cmp3 < 0 ||
+ ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) {
+ ++i;
+ }
+ else {
+ i->_lower = j->_upper;
+ i->_lower.flipInclusive();
+ ++j;
+ }
+ }
+ }
+ else {
+ int cmp2 = i->_lower._bound.woCompare( j->_upper._bound, false );
+ if ( cmp2 > 0 ||
+ ( cmp2 == 0 && ( !i->_lower._inclusive || !j->_upper._inclusive ) ) ) {
+ ++j;
+ }
+ else {
+ int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false );
+ if ( cmp3 < 0 ||
+ ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) {
+ ++i;
+ }
+ else {
+ i->_lower = j->_upper;
+ i->_lower.flipInclusive();
+ ++j;
+ }
+ }
+ }
+ }
+ while( i != _intervals.end() ) {
+ newIntervals.push_back( *i );
+ ++i;
+ }
+ finishOperation( newIntervals, other );
+ return *this;
+ }
+
+ // TODO write a proper implementation that doesn't do a full copy
+ bool FieldRange::operator<=( const FieldRange &other ) const {
+ FieldRange temp = *this;
+ temp -= other;
+ return temp.empty();
+ }
+
+ void FieldRange::setExclusiveBounds() {
+ for( vector<FieldInterval>::iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ i->_lower._inclusive = false;
+ i->_upper._inclusive = false;
+ }
+ }
+
+ void FieldRange::reverse( FieldRange &ret ) const {
+ assert( _special.empty() );
+ ret._intervals.clear();
+ ret._objData = _objData;
+ for( vector<FieldInterval>::const_reverse_iterator i = _intervals.rbegin(); i != _intervals.rend(); ++i ) {
+ FieldInterval fi;
+ fi._lower = i->_upper;
+ fi._upper = i->_lower;
+ ret._intervals.push_back( fi );
+ }
+ }
+
+ BSONObj FieldRange::addObj( const BSONObj &o ) {
+ _objData.push_back( o );
+ return o;
+ }
+
+ string FieldInterval::toString() const {
+ StringBuilder buf;
+ buf << ( _lower._inclusive ? "[" : "(" );
+ buf << _lower._bound;
+ buf << " , ";
+ buf << _upper._bound;
+ buf << ( _upper._inclusive ? "]" : ")" );
+ return buf.str();
+ }
+
+ string FieldRange::toString() const {
+ StringBuilder buf;
+        buf << "(FieldRange special: " << _special << " singleKey: " << _singleKey << " intervals: ";
+ for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ buf << i->toString();
+ }
+
+ buf << ")";
+ return buf.str();
+ }
+
+ string FieldRangeSet::getSpecial() const {
+ string s = "";
+ for ( map<string,FieldRange>::const_iterator i=_ranges.begin(); i!=_ranges.end(); i++ ) {
+ if ( i->second.getSpecial().size() == 0 )
+ continue;
+ uassert( 13033 , "can't have 2 special fields" , s.size() == 0 );
+ s = i->second.getSpecial();
+ }
+ return s;
+ }
+
+ /**
+     * Btree scanning for a multidimensional key range will yield a
+ * multidimensional box. The idea here is that if an 'other'
+ * multidimensional box contains the current box we don't have to scan
+ * the current box. If the 'other' box contains the current box in
+ * all dimensions but one, we can safely subtract the values of 'other'
+ * along that one dimension from the values for the current box on the
+ * same dimension. In other situations, subtracting the 'other'
+ * box from the current box yields a result that is not a box (but
+ * rather can be expressed as a union of boxes). We don't support
+ * such splitting currently in calculating index ranges. Note that
+ * where I have said 'box' above, I actually mean sets of boxes because
+ * a field range can consist of multiple intervals.
+ */
+ const FieldRangeSet &FieldRangeSet::operator-=( const FieldRangeSet &other ) {
+ int nUnincluded = 0;
+ string unincludedKey;
+ map<string,FieldRange>::iterator i = _ranges.begin();
+ map<string,FieldRange>::const_iterator j = other._ranges.begin();
+ while( nUnincluded < 2 && i != _ranges.end() && j != other._ranges.end() ) {
+ int cmp = i->first.compare( j->first );
+ if ( cmp == 0 ) {
+ if ( i->second <= j->second ) {
+ // nothing
+ }
+ else {
+ ++nUnincluded;
+ unincludedKey = i->first;
+ }
+ ++i;
+ ++j;
+ }
+ else if ( cmp < 0 ) {
+ ++i;
+ }
+ else {
+ // other has a bound we don't, nothing can be done
+ return *this;
+ }
+ }
+ if ( j != other._ranges.end() ) {
+ // other has a bound we don't, nothing can be done
+ return *this;
+ }
+ if ( nUnincluded > 1 ) {
+ return *this;
+ }
+ if ( nUnincluded == 0 ) {
+ makeEmpty();
+ return *this;
+ }
+ // nUnincluded == 1
+ range( unincludedKey.c_str() ) -= other.range( unincludedKey.c_str() );
+ appendQueries( other );
+ return *this;
+ }
+
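+    /*
+     * Worked example (illustrative): with this = { a: [ 0, 10 ], b: [ 0, 10 ] }
+     * and other = { a: [ 0, 10 ], b: [ 0, 5 ] }, only "b" is unincluded, so
+     * subtraction happens along that one dimension, giving
+     * { a: [ 0, 10 ], b: ( 5, 10 ] }. Had two fields been unincluded, the set
+     * would be left unchanged, since the difference would no longer be a box.
+     */
+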
+ const FieldRangeSet &FieldRangeSet::operator&=( const FieldRangeSet &other ) {
+ map<string,FieldRange>::iterator i = _ranges.begin();
+ map<string,FieldRange>::const_iterator j = other._ranges.begin();
+ while( i != _ranges.end() && j != other._ranges.end() ) {
+ int cmp = i->first.compare( j->first );
+ if ( cmp == 0 ) {
+ // Same field name, so find range intersection.
+ i->second &= j->second;
+ ++i;
+ ++j;
+ }
+ else if ( cmp < 0 ) {
+ // Field present in *this.
+ ++i;
+ }
+ else {
+ // Field not present in *this, so add it.
+ range( j->first.c_str() ) = j->second;
+ ++j;
+ }
+ }
+ while( j != other._ranges.end() ) {
+ // Field not present in *this, add it.
+ range( j->first.c_str() ) = j->second;
+ ++j;
+ }
+ appendQueries( other );
+ return *this;
+ }
+
+ void FieldRangeSet::appendQueries( const FieldRangeSet &other ) {
+ for( vector<BSONObj>::const_iterator i = other._queries.begin(); i != other._queries.end(); ++i ) {
+ _queries.push_back( *i );
+ }
+ }
+
+ void FieldRangeSet::makeEmpty() {
+ for( map<string,FieldRange>::iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ i->second.makeEmpty();
+ }
+ }
+
+ void FieldRangeSet::processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize ) {
+ BSONElement g = f;
+ int op2 = g.getGtLtOp();
+ if ( op2 == BSONObj::opALL ) {
+ BSONElement h = g;
+ uassert( 13050 , "$all requires array", h.type() == Array );
+ BSONObjIterator i( h.embeddedObject() );
+ if( i.more() ) {
+ BSONElement x = i.next();
+ if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
+ g = x.embeddedObject().firstElement();
+ op2 = g.getGtLtOp();
+ }
+ }
+ }
+ if ( op2 == BSONObj::opELEM_MATCH ) {
+ BSONObjIterator k( g.embeddedObjectUserCheck() );
+ while ( k.more() ) {
+ BSONElement h = k.next();
+ StringBuilder buf(32);
+ buf << fieldName << "." << h.fieldName();
+ string fullname = buf.str();
+
+ int op3 = getGtLtOp( h );
+ if ( op3 == BSONObj::Equality ) {
+ range( fullname.c_str() ) &= FieldRange( h , _singleKey , isNot , optimize );
+ }
+ else {
+ BSONObjIterator l( h.embeddedObject() );
+ while ( l.more() ) {
+ range( fullname.c_str() ) &= FieldRange( l.next() , _singleKey , isNot , optimize );
+ }
+ }
+ }
+ }
+ else {
+ range( fieldName ) &= FieldRange( f , _singleKey , isNot , optimize );
+ }
+ }
+
+ void FieldRangeSet::processQueryField( const BSONElement &e, bool optimize ) {
+ if ( e.fieldName()[ 0 ] == '$' ) {
+ if ( strcmp( e.fieldName(), "$and" ) == 0 ) {
+ uassert( 14816 , "$and expression must be a nonempty array" , e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ uassert( 14817 , "$and elements must be objects" , e.type() == Object );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ processQueryField( j.next(), optimize );
+ }
+ }
+ }
+
+ if ( strcmp( e.fieldName(), "$where" ) == 0 ) {
+ return;
+ }
+
+ if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+ return;
+ }
+
+ if ( strcmp( e.fieldName(), "$nor" ) == 0 ) {
+ return;
+ }
+ }
+
+ bool equality = ( getGtLtOp( e ) == BSONObj::Equality );
+ if ( equality && e.type() == Object ) {
+ equality = ( strcmp( e.embeddedObject().firstElementFieldName(), "$not" ) != 0 );
+ }
+
+ if ( equality || ( e.type() == Object && !e.embeddedObject()[ "$regex" ].eoo() ) ) {
+ range( e.fieldName() ) &= FieldRange( e , _singleKey , false , optimize );
+ }
+ if ( !equality ) {
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ if ( strcmp( f.fieldName(), "$not" ) == 0 ) {
+ switch( f.type() ) {
+ case Object: {
+ BSONObjIterator k( f.embeddedObject() );
+ while( k.more() ) {
+ BSONElement g = k.next();
+ uassert( 13034, "invalid use of $not", g.getGtLtOp() != BSONObj::Equality );
+ processOpElement( e.fieldName(), g, true, optimize );
+ }
+ break;
+ }
+ case RegEx:
+ processOpElement( e.fieldName(), f, true, optimize );
+ break;
+ default:
+ uassert( 13041, "invalid use of $not", false );
+ }
+ }
+ else {
+ processOpElement( e.fieldName(), f, false, optimize );
+ }
+ }
+ }
+ }
+
+ FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj &query, bool singleKey, bool optimize )
+ : _ns( ns ), _queries( 1, query.getOwned() ), _singleKey( singleKey ) {
+ BSONObjIterator i( _queries[ 0 ] );
+
+ while( i.more() ) {
+ processQueryField( i.next(), optimize );
+ }
+ }
+
+ FieldRangeVector::FieldRangeVector( const FieldRangeSet &frs, const IndexSpec &indexSpec, int direction )
+ :_indexSpec( indexSpec ), _direction( direction >= 0 ? 1 : -1 ) {
+ _queries = frs._queries;
+ BSONObjIterator i( _indexSpec.keyPattern );
+ set< string > baseObjectNontrivialPrefixes;
+ while( i.more() ) {
+ BSONElement e = i.next();
+ const FieldRange *range = &frs.range( e.fieldName() );
+ if ( !frs.singleKey() ) {
+ string prefix = str::before( e.fieldName(), '.' );
+ if ( baseObjectNontrivialPrefixes.count( prefix ) > 0 ) {
+ // A field with the same parent field has already been
+                    // constrained, and with a multikey index we cannot
+ // constrain this field.
+ range = &frs.trivialRange();
+ } else {
+ if ( range->nontrivial() ) {
+ baseObjectNontrivialPrefixes.insert( prefix );
+ }
+ }
+ }
+ int number = (int) e.number(); // returns 0.0 if not numeric
+ bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 );
+ if ( forward ) {
+ _ranges.push_back( *range );
+ }
+ else {
+ _ranges.push_back( FieldRange( BSONObj().firstElement(), frs.singleKey(), false, true ) );
+ range->reverse( _ranges.back() );
+ }
+ assert( !_ranges.back().empty() );
+ }
+ uassert( 13385, "combinatorial limit of $in partitioning of result set exceeded", size() < 1000000 );
+ }
+
+ BSONObj FieldRangeVector::startKey() const {
+ BSONObjBuilder b;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ const FieldInterval &fi = i->intervals().front();
+ b.appendAs( fi._lower._bound, "" );
+ }
+ return b.obj();
+ }
+
+ BSONObj FieldRangeVector::endKey() const {
+ BSONObjBuilder b;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ const FieldInterval &fi = i->intervals().back();
+ b.appendAs( fi._upper._bound, "" );
+ }
+ return b.obj();
+ }
+
+ BSONObj FieldRangeVector::obj() const {
+ BSONObjBuilder b;
+ BSONObjIterator k( _indexSpec.keyPattern );
+ for( int i = 0; i < (int)_ranges.size(); ++i ) {
+ BSONArrayBuilder a( b.subarrayStart( k.next().fieldName() ) );
+ for( vector<FieldInterval>::const_iterator j = _ranges[ i ].intervals().begin();
+ j != _ranges[ i ].intervals().end(); ++j ) {
+ a << BSONArray( BSON_ARRAY( j->_lower._bound << j->_upper._bound ).clientReadable() );
+ }
+ a.done();
+ }
+ return b.obj();
+ }
+
+ FieldRange *FieldRangeSet::__singleKeyTrivialRange = 0;
+ FieldRange *FieldRangeSet::__multiKeyTrivialRange = 0;
+ const FieldRange &FieldRangeSet::trivialRange() const {
+ FieldRange *&ret = _singleKey ? __singleKeyTrivialRange : __multiKeyTrivialRange;
+ if ( ret == 0 ) {
+ ret = new FieldRange( BSONObj().firstElement(), _singleKey, false, true );
+ }
+ return *ret;
+ }
+
+ BSONObj FieldRangeSet::simplifiedQuery( const BSONObj &_fields ) const {
+ BSONObj fields = _fields;
+ if ( fields.isEmpty() ) {
+ BSONObjBuilder b;
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ b.append( i->first, 1 );
+ }
+ fields = b.obj();
+ }
+ BSONObjBuilder b;
+ BSONObjIterator i( fields );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ const char *name = e.fieldName();
+ const FieldRange &eRange = range( name );
+ assert( !eRange.empty() );
+ if ( eRange.equality() )
+ b.appendAs( eRange.min(), name );
+ else if ( eRange.nontrivial() ) {
+ BSONObj o;
+ BSONObjBuilder c;
+ if ( eRange.min().type() != MinKey )
+ c.appendAs( eRange.min(), eRange.minInclusive() ? "$gte" : "$gt" );
+ if ( eRange.max().type() != MaxKey )
+ c.appendAs( eRange.max(), eRange.maxInclusive() ? "$lte" : "$lt" );
+ o = c.obj();
+ b.append( name, o );
+ }
+ }
+ return b.obj();
+ }
+
+ QueryPattern FieldRangeSet::pattern( const BSONObj &sort ) const {
+ return QueryPattern( *this, sort );
+ }
+
+ // TODO get rid of this
+ BoundList FieldRangeSet::indexBounds( const BSONObj &keyPattern, int direction ) const {
+ typedef vector<pair<shared_ptr<BSONObjBuilder>, shared_ptr<BSONObjBuilder> > > BoundBuilders;
+ BoundBuilders builders;
+ builders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
+ BSONObjIterator i( keyPattern );
+ bool ineq = false; // until ineq is true, we are just dealing with equality and $in bounds
+ while( i.more() ) {
+ BSONElement e = i.next();
+ const FieldRange &fr = range( e.fieldName() );
+ int number = (int) e.number(); // returns 0.0 if not numeric
+ bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 );
+ if ( !ineq ) {
+ if ( fr.equality() ) {
+ for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) {
+ j->first->appendAs( fr.min(), "" );
+ j->second->appendAs( fr.min(), "" );
+ }
+ }
+ else {
+ if ( !fr.inQuery() ) {
+ ineq = true;
+ }
+ BoundBuilders newBuilders;
+ const vector<FieldInterval> &intervals = fr.intervals();
+ for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i ) {
+ BSONObj first = i->first->obj();
+ BSONObj second = i->second->obj();
+
+ const unsigned maxCombinations = 4000000;
+ if ( forward ) {
+ for( vector<FieldInterval>::const_iterator j = intervals.begin(); j != intervals.end(); ++j ) {
+ uassert( 13303, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations );
+ newBuilders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
+ newBuilders.back().first->appendElements( first );
+ newBuilders.back().second->appendElements( second );
+ newBuilders.back().first->appendAs( j->_lower._bound, "" );
+ newBuilders.back().second->appendAs( j->_upper._bound, "" );
+ }
+ }
+ else {
+ for( vector<FieldInterval>::const_reverse_iterator j = intervals.rbegin(); j != intervals.rend(); ++j ) {
+ uassert( 13304, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations );
+ newBuilders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
+ newBuilders.back().first->appendElements( first );
+ newBuilders.back().second->appendElements( second );
+ newBuilders.back().first->appendAs( j->_upper._bound, "" );
+ newBuilders.back().second->appendAs( j->_lower._bound, "" );
+ }
+ }
+ }
+ builders = newBuilders;
+ }
+ }
+ else {
+ for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) {
+ j->first->appendAs( forward ? fr.min() : fr.max(), "" );
+ j->second->appendAs( forward ? fr.max() : fr.min(), "" );
+ }
+ }
+ }
+ BoundList ret;
+ for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i )
+ ret.push_back( make_pair( i->first->obj(), i->second->obj() ) );
+ return ret;
+ }
+
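+    /*
+     * Worked example (illustrative): for key pattern { a: 1, b: 1 } and query
+     * { a: { $in: [ 1, 2 ] }, b: { $gt: 3 } }, the $in on the equality prefix
+     * splits the bounds into two start/end key pairs:
+     *
+     *   [ { "": 1, "": 3 } , { "": 1, "": <max for b> } ]
+     *   [ { "": 2, "": 3 } , { "": 2, "": <max for b> } ]
+     */
+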
+ FieldRangeSet *FieldRangeSet::subset( const BSONObj &fields ) const {
+ FieldRangeSet *ret = new FieldRangeSet( _ns, BSONObj(), _singleKey, true );
+ BSONObjIterator i( fields );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( range( e.fieldName() ).nontrivial() ) {
+ ret->range( e.fieldName() ) = range( e.fieldName() );
+ }
+ }
+ ret->_queries = _queries;
+ return ret;
+ }
+
+ bool FieldRangeSetPair::noNontrivialRanges() const {
+ return _singleKey.matchPossible() && _singleKey.nNontrivialRanges() == 0 &&
+ _multiKey.matchPossible() && _multiKey.nNontrivialRanges() == 0;
+ }
+
+ FieldRangeSetPair &FieldRangeSetPair::operator&=( const FieldRangeSetPair &other ) {
+ _singleKey &= other._singleKey;
+ _multiKey &= other._multiKey;
+ return *this;
+ }
+
+ FieldRangeSetPair &FieldRangeSetPair::operator-=( const FieldRangeSet &scanned ) {
+ _singleKey -= scanned;
+ _multiKey -= scanned;
+ return *this;
+ }
+
+ BSONObj FieldRangeSetPair::simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const {
+ return frsForIndex( d, idxNo ).simplifiedQuery( keyPattern );
+ }
+
+ void FieldRangeSetPair::assertValidIndex( const NamespaceDetails *d, int idxNo ) const {
+ massert( 14048, "FieldRangeSetPair invalid index specified", idxNo >= 0 && idxNo < d->nIndexes );
+ }
+
+ const FieldRangeSet &FieldRangeSetPair::frsForIndex( const NamespaceDetails* nsd, int idxNo ) const {
+ assertValidIndexOrNoIndex( nsd, idxNo );
+ if ( idxNo < 0 ) {
+ // An unindexed cursor cannot have a "single key" constraint.
+ return _multiKey;
+ }
+ return nsd->isMultikey( idxNo ) ? _multiKey : _singleKey;
+ }
+
+ bool FieldRangeVector::matchesElement( const BSONElement &e, int i, bool forward ) const {
+ bool eq;
+ int l = matchingLowElement( e, i, forward, eq );
+ return ( l % 2 == 0 ); // if we're inside an interval
+ }
+
+ // binary search for interval containing the specified element
+ // an even return value indicates that the element is contained within a valid interval
+ int FieldRangeVector::matchingLowElement( const BSONElement &e, int i, bool forward, bool &lowEquality ) const {
+ lowEquality = false;
+ int l = -1;
+ int h = _ranges[ i ].intervals().size() * 2;
+ while( l + 1 < h ) {
+ int m = ( l + h ) / 2;
+ BSONElement toCmp;
+ bool toCmpInclusive;
+ const FieldInterval &interval = _ranges[ i ].intervals()[ m / 2 ];
+ if ( m % 2 == 0 ) {
+ toCmp = interval._lower._bound;
+ toCmpInclusive = interval._lower._inclusive;
+ }
+ else {
+ toCmp = interval._upper._bound;
+ toCmpInclusive = interval._upper._inclusive;
+ }
+ int cmp = toCmp.woCompare( e, false );
+ if ( !forward ) {
+ cmp = -cmp;
+ }
+ if ( cmp < 0 ) {
+ l = m;
+ }
+ else if ( cmp > 0 ) {
+ h = m;
+ }
+ else {
+ if ( m % 2 == 0 ) {
+ lowEquality = true;
+ }
+ int ret = m;
+ // if left match and inclusive, all good
+ // if left match and not inclusive, return right before left bound
+ // if right match and inclusive, return left bound
+ // if right match and not inclusive, return right bound
+ if ( ( m % 2 == 0 && !toCmpInclusive ) || ( m % 2 == 1 && toCmpInclusive ) ) {
+ --ret;
+ }
+ return ret;
+ }
+ }
+ assert( l + 1 == h );
+ return l;
+ }
+
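+    /*
+     * Worked example (illustrative): with intervals [ 2, 4 ] , [ 6, 8 ] for
+     * field i, the bounds searched are 2, 4, 6, 8 at positions 0-3:
+     *
+     *   matchingLowElement( 3 ) => 0  (even: inside [ 2, 4 ])
+     *   matchingLowElement( 5 ) => 1  (odd: in the gap between intervals)
+     *   matchingLowElement( 6 ) => 2  (even: on an inclusive lower bound)
+     */
+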
+ bool FieldRangeVector::matchesKey( const BSONObj &key ) const {
+ BSONObjIterator j( key );
+ BSONObjIterator k( _indexSpec.keyPattern );
+ for( int l = 0; l < (int)_ranges.size(); ++l ) {
+ int number = (int) k.next().number();
+ bool forward = ( number >= 0 ? 1 : -1 ) * ( _direction >= 0 ? 1 : -1 ) > 0;
+ if ( !matchesElement( j.next(), l, forward ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool FieldRangeVector::matches( const BSONObj &obj ) const {
+
+ bool ok = false;
+
+ // TODO The representation of matching keys could potentially be optimized
+ // more for the case at hand. (For example, we can potentially consider
+ // fields individually instead of constructing several bson objects using
+ // multikey arrays.) But getKeys() canonically defines the key set for a
+ // given object and for now we are using it as is.
+ BSONObjSet keys;
+ _indexSpec.getKeys( obj, keys );
+ for( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+ if ( matchesKey( *i ) ) {
+ ok = true;
+ break;
+ }
+ }
+
+ LOG(5) << "FieldRangeVector::matches() returns " << ok << endl;
+
+ return ok;
+ }
+
+ BSONObj FieldRangeVector::firstMatch( const BSONObj &obj ) const {
+ // NOTE Only works in forward direction.
+ assert( _direction >= 0 );
+ BSONObjSet keys( BSONObjCmp( _indexSpec.keyPattern ) );
+ _indexSpec.getKeys( obj, keys );
+ for( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+ if ( matchesKey( *i ) ) {
+ return *i;
+ }
+ }
+ return BSONObj();
+ }
+
+ // TODO optimize more
+ int FieldRangeVectorIterator::advance( const BSONObj &curr ) {
+ BSONObjIterator j( curr );
+ BSONObjIterator o( _v._indexSpec.keyPattern );
+ // track first field for which we are not at the end of the valid values,
+ // since we may need to advance from the key prefix ending with this field
+ int latestNonEndpoint = -1;
+ // iterate over fields to determine appropriate advance method
+ for( int i = 0; i < (int)_i.size(); ++i ) {
+ if ( i > 0 && !_v._ranges[ i - 1 ].intervals()[ _i[ i - 1 ] ].equality() ) {
+ // if last bound was inequality, we don't know anything about where we are for this field
+                // TODO if possible avoid this in certain cases, e.g. when the previous key
+                // and the current key have the same value in the previous field
+ setMinus( i );
+ }
+ bool eq = false;
+ BSONElement oo = o.next();
+ bool reverse = ( ( oo.number() < 0 ) ^ ( _v._direction < 0 ) );
+ BSONElement jj = j.next();
+ if ( _i[ i ] == -1 ) { // unknown position for this field, do binary search
+ bool lowEquality;
+ int l = _v.matchingLowElement( jj, i, !reverse, lowEquality );
+ if ( l % 2 == 0 ) { // we are in a valid range for this field
+ _i[ i ] = l / 2;
+ int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ];
+ if ( diff > 1 ) {
+ latestNonEndpoint = i;
+ }
+ else if ( diff == 1 ) {
+ int x = _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._bound.woCompare( jj, false );
+ if ( x != 0 ) {
+ latestNonEndpoint = i;
+ }
+ }
+ continue;
+ }
+ else { // not in a valid range for this field - determine if and how to advance
+ // check if we're after the last interval for this field
+ if ( l == (int)_v._ranges[ i ].intervals().size() * 2 - 1 ) {
+ if ( latestNonEndpoint == -1 ) {
+ return -2;
+ }
+ setZero( latestNonEndpoint + 1 );
+ // skip to curr / latestNonEndpoint + 1 / superlative
+ _after = true;
+ return latestNonEndpoint + 1;
+ }
+ _i[ i ] = ( l + 1 ) / 2;
+ if ( lowEquality ) {
+ // skip to curr / i + 1 / superlative
+ _after = true;
+ return i + 1;
+ }
+ // skip to curr / i / nextbounds
+ _cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound;
+ _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive;
+ for( int j = i + 1; j < (int)_i.size(); ++j ) {
+ _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+ _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+ }
+ _after = false;
+ return i;
+ }
+ }
+ bool first = true;
+ // _i[ i ] != -1, so we have a starting interval for this field
+ // which serves as a lower/equal bound on the first iteration -
+ // we advance from this interval to find a matching interval
+ while( _i[ i ] < (int)_v._ranges[ i ].intervals().size() ) {
+ // compare to current interval's upper bound
+ int x = _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._bound.woCompare( jj, false );
+ if ( reverse ) {
+ x = -x;
+ }
+ if ( x == 0 && _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._inclusive ) {
+ eq = true;
+ break;
+ }
+ // see if we're less than the upper bound
+ if ( x > 0 ) {
+ if ( i == 0 && first ) {
+                        // the value of the 1st field won't go backward, so don't check the lower bound
+                        // TODO maybe we can rely on the 'first' check alone?
+ break;
+ }
+ // if it's an equality interval, don't need to compare separately to lower bound
+ if ( !_v._ranges[ i ].intervals()[ _i[ i ] ].equality() ) {
+ // compare to current interval's lower bound
+ x = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound.woCompare( jj, false );
+ if ( reverse ) {
+ x = -x;
+ }
+ }
+                    // if we're equal to the lower bound but the bound is not inclusive, advance
+ if ( ( x == 0 && !_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive ) ) {
+ setZero( i + 1 );
+ // skip to curr / i + 1 / superlative
+ _after = true;
+ return i + 1;
+ }
+ // if we're less than the lower bound, advance
+ if ( x > 0 ) {
+ setZero( i + 1 );
+ // skip to curr / i / nextbounds
+ _cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound;
+ _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive;
+                        for( int k = i + 1; k < (int)_i.size(); ++k ) {
+                            _cmp[ k ] = &_v._ranges[ k ].intervals().front()._lower._bound;
+                            _inc[ k ] = _v._ranges[ k ].intervals().front()._lower._inclusive;
+                        }
+ _after = false;
+ return i;
+ }
+ else {
+ break;
+ }
+ }
+ // we're above the upper bound, so try next interval and reset remaining fields
+ ++_i[ i ];
+ setZero( i + 1 );
+ first = false;
+ }
+ int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ];
+ if ( diff > 1 || ( !eq && diff == 1 ) ) {
+ // check if we're not at the end of valid values for this field
+ latestNonEndpoint = i;
+ }
+ else if ( diff == 0 ) { // check if we're past the last interval for this field
+ if ( latestNonEndpoint == -1 ) {
+ return -2;
+ }
+ // more values possible, skip...
+ setZero( latestNonEndpoint + 1 );
+ // skip to curr / latestNonEndpoint + 1 / superlative
+ _after = true;
+ return latestNonEndpoint + 1;
+ }
+ }
+ return -1;
+ }
+
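+    // Point each field's skip-comparison element (later exposed via cmp() and
+    // inc()) at the lower bound of that field's first interval.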
+ void FieldRangeVectorIterator::prepDive() {
+ for( int j = 0; j < (int)_i.size(); ++j ) {
+ _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+ _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+ }
+ }
+
+ BSONObj FieldRangeVectorIterator::startKey() {
+ BSONObjBuilder b;
+        for( unsigned int i = 0; i < _i.size(); ++i ) {
+ const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
+ b.appendAs( fi._lower._bound, "" );
+ }
+ return b.obj();
+ }
+
+ // temp
+ BSONObj FieldRangeVectorIterator::endKey() {
+ BSONObjBuilder b;
+        for( unsigned int i = 0; i < _i.size(); ++i ) {
+ const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
+ b.appendAs( fi._upper._bound, "" );
+ }
+ return b.obj();
+ }
+
+ OrRangeGenerator::OrRangeGenerator( const char *ns, const BSONObj &query , bool optimize )
+ : _baseSet( ns, query, optimize ), _orFound() {
+
+ BSONObjIterator i( _baseSet.originalQuery() );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+ uassert( 13262, "$or requires nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ uassert( 13263, "$or array must contain objects", f.type() == Object );
+ _orSets.push_back( FieldRangeSetPair( ns, f.embeddedObject(), optimize ) );
+ uassert( 13291, "$or may not contain 'special' query", _orSets.back().getSpecial().empty() );
+ _originalOrSets.push_back( _orSets.back() );
+ }
+ _orFound = true;
+ continue;
+ }
+ }
+ }
+
+ void OrRangeGenerator::assertMayPopOrClause() {
+ massert( 13274, "no or clause to pop", !orFinished() );
+ }
+
+ void OrRangeGenerator::popOrClause( NamespaceDetails *nsd, int idxNo, const BSONObj &keyPattern ) {
+ assertMayPopOrClause();
+ auto_ptr<FieldRangeSet> holder;
+ const FieldRangeSet *toDiff = &_originalOrSets.front().frsForIndex( nsd, idxNo );
+ BSONObj indexSpec = keyPattern;
+ if ( !indexSpec.isEmpty() && toDiff->matchPossibleForIndex( indexSpec ) ) {
+ holder.reset( toDiff->subset( indexSpec ) );
+ toDiff = holder.get();
+ }
+ popOrClause( toDiff, nsd, idxNo, keyPattern );
+ }
+
+ void OrRangeGenerator::popOrClauseSingleKey() {
+ assertMayPopOrClause();
+ FieldRangeSet *toDiff = &_originalOrSets.front()._singleKey;
+ popOrClause( toDiff );
+ }
+
+ /**
+     * Removes the top $or clause, which would have been recently scanned, and
+     * removes the field ranges it covers from all subsequent $or clauses. As a
+     * side effect, this function may invalidate the return values of topFrsp()
+     * calls made before this function was called.
+     * @param keyPattern - Keys of the index that was used to satisfy the last
+     * $or clause. Used to determine the range of keys that were scanned. If
+ * empty we do not constrain the previous clause's ranges using index keys,
+ * which may reduce opportunities for range elimination.
+ */
+ void OrRangeGenerator::popOrClause( const FieldRangeSet *toDiff, NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) {
+ list<FieldRangeSetPair>::iterator i = _orSets.begin();
+ list<FieldRangeSetPair>::iterator j = _originalOrSets.begin();
+ ++i;
+ ++j;
+ while( i != _orSets.end() ) {
+ *i -= *toDiff;
+ // Check if match is possible at all, and if it is possible for the recently scanned index.
+ if( !i->matchPossible() || ( d && !i->matchPossibleForIndex( d, idxNo, keyPattern ) ) ) {
+ i = _orSets.erase( i );
+ j = _originalOrSets.erase( j );
+ }
+ else {
+ ++i;
+ ++j;
+ }
+ }
+ _oldOrSets.push_front( _orSets.front() );
+ _orSets.pop_front();
+ _originalOrSets.pop_front();
+ }
+
+ struct SimpleRegexUnitTest : UnitTest {
+ void run() {
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^foo");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "foo" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^f?oo");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^fz?oo");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^f", "");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af", "");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^f", "m");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af", "m");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af", "mi");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af \t\vo\n\ro \\ \\# #comment", "mx");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "foo #" );
+ }
+ {
+ assert( simpleRegex("^\\Qasdf\\E", "", NULL) == "asdf" );
+ assert( simpleRegex("^\\Qasdf\\E.*", "", NULL) == "asdf" );
+ assert( simpleRegex("^\\Qasdf", "", NULL) == "asdf" ); // PCRE supports this
+ assert( simpleRegex("^\\Qasdf\\\\E", "", NULL) == "asdf\\" );
+ assert( simpleRegex("^\\Qas.*df\\E", "", NULL) == "as.*df" );
+ assert( simpleRegex("^\\Qas\\Q[df\\E", "", NULL) == "as\\Q[df" );
+ assert( simpleRegex("^\\Qas\\E\\\\E\\Q$df\\E", "", NULL) == "as\\E$df" ); // quoted string containing \E
+ }
+
+ }
+ } simple_regex_unittest;
+
+
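+    /**
+     * Applies a command object's "skip" and "limit" fields to a count.
+     * Worked example (illustrative): with num = 100, skip = 10 and limit = 20,
+     * the skip reduces 100 to 90 and the limit then caps the result at 20.
+     */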
+ long long applySkipLimit( long long num , const BSONObj& cmd ) {
+ BSONElement s = cmd["skip"];
+ BSONElement l = cmd["limit"];
+
+ if ( s.isNumber() ) {
+ num = num - s.numberLong();
+ if ( num < 0 ) {
+ num = 0;
+ }
+ }
+
+ if ( l.isNumber() ) {
+ long long limit = l.numberLong();
+ if ( limit < num ) {
+ num = limit;
+ }
+ }
+
+ return num;
+ }
+
+
+} // namespace mongo
diff --git a/src/mongo/db/queryutil.h b/src/mongo/db/queryutil.h
new file mode 100644
index 00000000000..aefef27cc8b
--- /dev/null
+++ b/src/mongo/db/queryutil.h
@@ -0,0 +1,443 @@
+// @file queryutil.h - Utility classes representing ranges of valid BSONElement values for a query.
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+#include "indexkey.h"
+
+namespace mongo {
+
+ /**
+ * One side of an interval of valid BSONElements, specified by a value and a
+ * boolean indicating whether the interval includes the value.
+ */
+ struct FieldBound {
+ BSONElement _bound;
+ bool _inclusive;
+ bool operator==( const FieldBound &other ) const {
+ return _bound.woCompare( other._bound ) == 0 &&
+ _inclusive == other._inclusive;
+ }
+ void flipInclusive() { _inclusive = !_inclusive; }
+ };
+
+ /** A closed interval composed of a lower and an upper FieldBound. */
+ struct FieldInterval {
+ FieldInterval() : _cachedEquality( -1 ) {}
+ FieldInterval( const BSONElement& e ) : _cachedEquality( -1 ) {
+ _lower._bound = _upper._bound = e;
+ _lower._inclusive = _upper._inclusive = true;
+ }
+ FieldBound _lower;
+ FieldBound _upper;
+        /** @return true iff the interval can contain at least one element (i.e. it is non-empty). */
+ bool strictValid() const {
+ int cmp = _lower._bound.woCompare( _upper._bound, false );
+ return ( cmp < 0 || ( cmp == 0 && _lower._inclusive && _upper._inclusive ) );
+ }
+ /** @return true iff the interval is an equality constraint. */
+ bool equality() const;
+ mutable int _cachedEquality;
+
+ string toString() const;
+ };
+
+ /**
+ * An ordered list of FieldIntervals expressing constraints on valid
+ * BSONElement values for a field.
+ */
+ class FieldRange {
+ public:
+ FieldRange( const BSONElement &e , bool singleKey , bool isNot=false , bool optimize=true );
+
+ /** @return Range intersection with 'other'. */
+ const FieldRange &operator&=( const FieldRange &other );
+ /** @return Range union with 'other'. */
+ const FieldRange &operator|=( const FieldRange &other );
+        /** @return Range of elements included in 'this' but not 'other'. */
+ const FieldRange &operator-=( const FieldRange &other );
+ /** @return true iff this range is a subset of 'other'. */
+ bool operator<=( const FieldRange &other ) const;
+
+ /**
+ * If there are any valid values for this range, the extreme values can
+ * be extracted.
+ */
+
+ BSONElement min() const { assert( !empty() ); return _intervals[ 0 ]._lower._bound; }
+ BSONElement max() const { assert( !empty() ); return _intervals[ _intervals.size() - 1 ]._upper._bound; }
+ bool minInclusive() const { assert( !empty() ); return _intervals[ 0 ]._lower._inclusive; }
+ bool maxInclusive() const { assert( !empty() ); return _intervals[ _intervals.size() - 1 ]._upper._inclusive; }
+
+ /** @return true iff this range expresses a single equality interval. */
+ bool equality() const;
+ /** @return true if all the intervals for this range are equalities */
+ bool inQuery() const;
+ /** @return true iff this range does not include every BSONElement */
+ bool nontrivial() const;
+ /** @return true iff this range matches no BSONElements. */
+ bool empty() const { return _intervals.empty(); }
+
+ /** Empty the range so it matches no BSONElements. */
+ void makeEmpty() { _intervals.clear(); }
+ const vector<FieldInterval> &intervals() const { return _intervals; }
+ string getSpecial() const { return _special; }
+ /** Make component intervals noninclusive. */
+ void setExclusiveBounds();
+ /**
+ * Constructs a range where all FieldIntervals and FieldBounds are in
+ * the opposite order of the current range.
+ * NOTE the resulting intervals might not be strictValid().
+ */
+ void reverse( FieldRange &ret ) const;
+
+ string toString() const;
+ private:
+ BSONObj addObj( const BSONObj &o );
+ void finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other );
+ vector<FieldInterval> _intervals;
+ // Owns memory for our BSONElements.
+ vector<BSONObj> _objData;
+ string _special;
+ bool _singleKey;
+ };
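+
+    /*
+     * Illustrative example (not from the original source): for the query
+     * { a: { $gte: 3, $lt: 7 } } the FieldRange for 'a' holds a single
+     * FieldInterval with _lower = { _bound: 3, _inclusive: true } and
+     * _upper = { _bound: 7, _inclusive: false }, while { a: { $in: [ 1, 2 ] } }
+     * yields two equality intervals.
+     */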
+
+ /**
+ * A BoundList contains intervals specified by inclusive start
+ * and end bounds. The intervals should be nonoverlapping and occur in
+ * the specified direction of traversal. For example, given a simple index {i:1}
+ * and direction +1, one valid BoundList is: (1, 2); (4, 6). The same BoundList
+ * would be valid for index {i:-1} with direction -1.
+ */
+ typedef vector<pair<BSONObj,BSONObj> > BoundList;
+
+ class QueryPattern;
+
+ /**
+ * A set of FieldRanges determined from constraints on the fields of a query,
+ * that may be used to determine index bounds.
+ */
+ class FieldRangeSet {
+ public:
+ friend class OrRangeGenerator;
+ friend class FieldRangeVector;
+ FieldRangeSet( const char *ns, const BSONObj &query , bool singleKey , bool optimize=true );
+
+ /** @return true if there is a nontrivial range for the given field. */
+ bool hasRange( const char *fieldName ) const {
+ map<string, FieldRange>::const_iterator f = _ranges.find( fieldName );
+ return f != _ranges.end();
+ }
+ /** @return range for the given field. */
+ const FieldRange &range( const char *fieldName ) const;
+ /** @return range for the given field. */
+ FieldRange &range( const char *fieldName );
+ /** @return the number of nontrivial ranges. */
+ int nNontrivialRanges() const;
+ /** @return the field ranges comprising this set. */
+ const map<string,FieldRange> &ranges() const { return _ranges; }
+ /**
+ * @return true if a match could be possible on every field. Generally this
+ * is not useful information for a single key FieldRangeSet and
+ * matchPossibleForIndex() should be used instead.
+ */
+ bool matchPossible() const;
+ /**
+ * @return true if a match could be possible given the value of _singleKey
+ * and index key 'keyPattern'.
+ * @param keyPattern May be {} or {$natural:1} for a non index scan.
+ */
+ bool matchPossibleForIndex( const BSONObj &keyPattern ) const;
+
+ const char *ns() const { return _ns; }
+
+ /**
+ * @return a simplified query from the extreme values of the nontrivial
+ * fields.
+ * @param fields If specified, the fields of the returned object are
+ * ordered to match those of 'fields'.
+ */
+ BSONObj simplifiedQuery( const BSONObj &fields = BSONObj() ) const;
+
+ QueryPattern pattern( const BSONObj &sort = BSONObj() ) const;
+ string getSpecial() const;
+
+ /**
+ * @return a FieldRangeSet approximation of the documents in 'this' but
+ * not in 'other'. The approximation will be a superset of the documents
+ * in 'this' but not 'other'.
+ */
+ const FieldRangeSet &operator-=( const FieldRangeSet &other );
+ /** @return intersection of 'this' with 'other'. */
+ const FieldRangeSet &operator&=( const FieldRangeSet &other );
+
+ /**
+ * @return an ordered list of bounds generated using an index key pattern
+ * and traversal direction.
+ *
+         * NOTE This function is deprecated in the query optimizer and is
+         * currently used only by the sharding code.
+ */
+ BoundList indexBounds( const BSONObj &keyPattern, int direction ) const;
+
+ /**
+ * @return - A new FieldRangeSet based on this FieldRangeSet, but with only
+ * a subset of the fields.
+         * @param fields - Only the fields named in 'fields' will be included in
+         * the returned FieldRangeSet.
+ */
+ FieldRangeSet *subset( const BSONObj &fields ) const;
+
+ bool singleKey() const { return _singleKey; }
+
+ BSONObj originalQuery() const { return _queries[ 0 ]; }
+ private:
+ void appendQueries( const FieldRangeSet &other );
+ void makeEmpty();
+ void processQueryField( const BSONElement &e, bool optimize );
+ void processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize );
+ static FieldRange *__singleKeyTrivialRange;
+ static FieldRange *__multiKeyTrivialRange;
+ const FieldRange &trivialRange() const;
+ map<string,FieldRange> _ranges;
+ const char *_ns;
+ // Owns memory for FieldRange BSONElements.
+ vector<BSONObj> _queries;
+ bool _singleKey;
+ };
+
+ class NamespaceDetails;
+
+ /**
+ * A pair of FieldRangeSets, one representing constraints for single key
+ * indexes and the other representing constraints for multi key indexes and
+ * unindexed scans. In several member functions the caller is asked to
+ * supply an index so that the implementation may utilize the proper
+ * FieldRangeSet and return results that are appropriate with respect to that
+ * supplied index.
+ */
+ class FieldRangeSetPair {
+ public:
+ FieldRangeSetPair( const char *ns, const BSONObj &query, bool optimize=true )
+ :_singleKey( ns, query, true, optimize ), _multiKey( ns, query, false, optimize ) {}
+
+ /**
+ * @return the appropriate single or multi key FieldRangeSet for the specified index.
+ * @param idxNo -1 for non index scan.
+ */
+ const FieldRangeSet &frsForIndex( const NamespaceDetails* nsd, int idxNo ) const;
+
+ /** @return a field range in the single key FieldRangeSet. */
+ const FieldRange &singleKeyRange( const char *fieldName ) const {
+ return _singleKey.range( fieldName );
+ }
+ /** @return true if the range limits are equivalent to an empty query. */
+ bool noNontrivialRanges() const;
+ /** @return false if a match is impossible regardless of index. */
+ bool matchPossible() const { return _multiKey.matchPossible(); }
+ /**
+ * @return false if a match is impossible on the specified index.
+ * @param idxNo -1 for non index scan.
+ */
+ bool matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const;
+
+ const char *ns() const { return _singleKey.ns(); }
+
+ string getSpecial() const { return _singleKey.getSpecial(); }
+
+ /** Intersect with another FieldRangeSetPair. */
+ FieldRangeSetPair &operator&=( const FieldRangeSetPair &other );
+ /**
+ * Subtract a FieldRangeSet, generally one expressing a range that has
+ * already been scanned.
+ */
+ FieldRangeSetPair &operator-=( const FieldRangeSet &scanned );
+
+ BoundList singleKeyIndexBounds( const BSONObj &keyPattern, int direction ) const {
+ return _singleKey.indexBounds( keyPattern, direction );
+ }
+
+ BSONObj originalQuery() const { return _singleKey.originalQuery(); }
+
+ private:
+ FieldRangeSetPair( const FieldRangeSet &singleKey, const FieldRangeSet &multiKey )
+ :_singleKey( singleKey ), _multiKey( multiKey ) {}
+ void assertValidIndex( const NamespaceDetails *d, int idxNo ) const;
+ void assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const;
+ /** matchPossibleForIndex() must be true. */
+ BSONObj simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const;
+ FieldRangeSet _singleKey;
+ FieldRangeSet _multiKey;
+ friend class OrRangeGenerator;
+ friend struct QueryUtilIndexed;
+ };
+
+ class IndexSpec;
+
+ /**
+ * An ordered list of fields and their FieldRanges, corresponding to valid
+ * index keys for a given index spec.
+ */
+ class FieldRangeVector {
+ public:
+ /**
+ * @param frs The valid ranges for all fields, as defined by the query spec
+ * @param indexSpec The index spec (key pattern and info)
+ * @param direction The direction of index traversal
+ */
+ FieldRangeVector( const FieldRangeSet &frs, const IndexSpec &indexSpec, int direction );
+
+ /** @return the number of index ranges represented by 'this' */
+ long long size();
+ /** @return starting point for an index traversal. */
+ BSONObj startKey() const;
+ /** @return end point for an index traversal. */
+ BSONObj endKey() const;
+ /** @return a client readable representation of 'this' */
+ BSONObj obj() const;
+
+ const IndexSpec& getSpec(){ return _indexSpec; }
+
+ /**
+ * @return true iff the provided document matches valid ranges on all
+ * of this FieldRangeVector's fields, which is the case iff this document
+ * would be returned while scanning the index corresponding to this
+ * FieldRangeVector. This function is used for $or clause deduping.
+ */
+ bool matches( const BSONObj &obj ) const;
+
+ /**
+ * @return first key of 'obj' that would be encountered by a forward
+ * index scan using this FieldRangeVector, BSONObj() if no such key.
+ */
+ BSONObj firstMatch( const BSONObj &obj ) const;
+
+ private:
+ int matchingLowElement( const BSONElement &e, int i, bool direction, bool &lowEquality ) const;
+ bool matchesElement( const BSONElement &e, int i, bool direction ) const;
+ bool matchesKey( const BSONObj &key ) const;
+ vector<FieldRange> _ranges;
+ const IndexSpec _indexSpec;
+ int _direction;
+ vector<BSONObj> _queries; // make sure mem owned
+ friend class FieldRangeVectorIterator;
+ };
+
+ /**
+ * Helper class for iterating through an ordered representation of keys
+ * to find those keys that match a specified FieldRangeVector.
+ */
+ class FieldRangeVectorIterator {
+ public:
+ FieldRangeVectorIterator( const FieldRangeVector &v ) : _v( v ), _i( _v._ranges.size(), -1 ), _cmp( _v._ranges.size(), 0 ), _inc( _v._ranges.size(), false ), _after() {
+ }
+ static BSONObj minObject() {
+ BSONObjBuilder b; b.appendMinKey( "" );
+ return b.obj();
+ }
+ static BSONObj maxObject() {
+ BSONObjBuilder b; b.appendMaxKey( "" );
+ return b.obj();
+ }
+ /**
+ * @return Suggested advance method, based on current key.
+ * -2 Iteration is complete, no need to advance.
+ * -1 Advance to the next key, without skipping.
+ * >=0 Skip parameter. If @return is r, skip to the key comprised
+ * of the first r elements of curr followed by the (r+1)th and
+ * remaining elements of cmp() (with inclusivity specified by
+ * the (r+1)th and remaining elements of inc()). If after() is
+ * true, skip past this key not to it.
+ */
+ int advance( const BSONObj &curr );
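+        /*
+         * Illustrative only - a sketch (not from the original source) of how a
+         * key-scanning caller might consume advance()'s return value; cursor
+         * and skipToKey() are hypothetical stand-ins:
+         *
+         *   FieldRangeVectorIterator it( frv );
+         *   while( cursor.ok() ) {
+         *       int r = it.advance( cursor.currKey() );
+         *       if ( r == -2 ) break;                // iteration complete
+         *       if ( r >= 0 ) {                      // skip via cmp()/inc()/after()
+         *           skipToKey( cursor, r, it.cmp(), it.inc(), it.after() );
+         *           continue;
+         *       }
+         *       cursor.advance();                    // r == -1: plain advance
+         *   }
+         */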
+ const vector<const BSONElement *> &cmp() const { return _cmp; }
+ const vector<bool> &inc() const { return _inc; }
+ bool after() const { return _after; }
+ void prepDive();
+ void setZero( int i ) { for( int j = i; j < (int)_i.size(); ++j ) _i[ j ] = 0; }
+ void setMinus( int i ) { for( int j = i; j < (int)_i.size(); ++j ) _i[ j ] = -1; }
+ bool ok() { return _i[ 0 ] < (int)_v._ranges[ 0 ].intervals().size(); }
+ BSONObj startKey();
+ // temp
+ BSONObj endKey();
+ private:
+ const FieldRangeVector &_v;
+ vector<int> _i;
+ vector<const BSONElement*> _cmp;
+ vector<bool> _inc;
+ bool _after;
+ };
+
+ /**
+ * As we iterate through $or clauses this class generates a FieldRangeSetPair
+ * for the current $or clause, in some cases by excluding ranges that were
+ * included in a previous clause.
+ */
+ class OrRangeGenerator {
+ public:
+ OrRangeGenerator( const char *ns, const BSONObj &query , bool optimize=true );
+
+ /**
+         * @return true iff we are done scanning $or clauses. If there's a
+         * useless $or clause, we won't use $or index ranges to help with scanning.
+ */
+ bool orFinished() const { return _orFound && _orSets.empty(); }
+ /** Iterates to the next $or clause by removing the current $or clause. */
+ void popOrClause( NamespaceDetails *nsd, int idxNo, const BSONObj &keyPattern );
+ void popOrClauseSingleKey();
+ /** @return FieldRangeSetPair for the current $or clause. */
+ FieldRangeSetPair *topFrsp() const;
+ /**
+ * @return original FieldRangeSetPair for the current $or clause. While the
+ * original bounds are looser, they are composed of fewer ranges and it
+ * is faster to do operations with them; when they can be used instead of
+ * more precise bounds, they should.
+ */
+ FieldRangeSetPair *topFrspOriginal() const;
+
+ string getSpecial() const { return _baseSet.getSpecial(); }
+
+ bool moreOrClauses() const { return !_orSets.empty(); }
+ private:
+ void assertMayPopOrClause();
+ void popOrClause( const FieldRangeSet *toDiff, NamespaceDetails *d = 0, int idxNo = -1, const BSONObj &keyPattern = BSONObj() );
+ FieldRangeSetPair _baseSet;
+ list<FieldRangeSetPair> _orSets;
+ list<FieldRangeSetPair> _originalOrSets;
+ // ensure memory is owned
+ list<FieldRangeSetPair> _oldOrSets;
+ bool _orFound;
+ friend struct QueryUtilIndexed;
+ };
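+
+    /*
+     * Illustrative use - a sketch (not from the original source) of how a
+     * query planner might iterate $or clauses; nsd, idxNo and keyPattern are
+     * assumed to come from the index selected for the current clause:
+     *
+     *   OrRangeGenerator org( ns, query );
+     *   while( org.moreOrClauses() ) {
+     *       FieldRangeSetPair *frsp = org.topFrsp();
+     *       // ... scan the selected index using ranges from *frsp ...
+     *       org.popOrClause( nsd, idxNo, keyPattern );
+     *   }
+     */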
+
+    /** returns a string that, when used as a matcher, would match a superset of the given regex
+ returns "" for complex regular expressions
+ used to optimize queries in some simple regex cases that start with '^'
+
+ if purePrefix != NULL, sets it to whether the regex can be converted to a range query
+ */
+ string simpleRegex(const char* regex, const char* flags, bool* purePrefix=NULL);
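+    // Examples, taken from SimpleRegexUnitTest in queryutil.cpp:
+    //   simpleRegex( "^foo", "", NULL ) == "foo"
+    //   simpleRegex( "^f?oo", "", NULL ) == ""   // optional char makes it too complex
+    //   simpleRegex( "^fz?oo", "", NULL ) == "f" // prefix before the optional part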
+
+ /** returns the upper bound of a query that matches prefix */
+ string simpleRegexEnd( string prefix );
+
+ long long applySkipLimit( long long num , const BSONObj& cmd );
+
+} // namespace mongo
+
+#include "queryutil-inl.h"
diff --git a/src/mongo/db/record.cpp b/src/mongo/db/record.cpp
new file mode 100644
index 00000000000..17987002efc
--- /dev/null
+++ b/src/mongo/db/record.cpp
@@ -0,0 +1,267 @@
+// record.cpp
+
+#include "pch.h"
+#include "pdfile.h"
+#include "../util/processinfo.h"
+#include "../util/net/listen.h"
+#include "pagefault.h"
+
+namespace mongo {
+
+ namespace ps {
+
+ enum State {
+ In , Out, Unk
+ };
+
+ enum Constants {
+ SliceSize = 65536 ,
+ MaxChain = 20 , // intentionally very low
+ NumSlices = 10 ,
+ RotateTimeSecs = 90
+ };
+
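+        // hash() mixes the 16-bit chunks of a region address into a bucket index
+        // in [0, SliceSize); the small prime offsets keep a zero chunk from
+        // zeroing out the whole product.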
+ int hash( size_t region ) {
+ return
+ abs( ( ( 7 + (int)(region & 0xFFFF) )
+ * ( 11 + (int)( ( region >> 16 ) & 0xFFFF ) )
+#if defined(_WIN64) || defined(__amd64__)
+ * ( 13 + (int)( ( region >> 32 ) & 0xFFFF ) )
+ * ( 17 + (int)( ( region >> 48 ) & 0xFFFF ) )
+#endif
+ ) % SliceSize );
+ }
+
+
+ /**
+ * simple hash map for region -> status
+         * this constitutes a single slice of time
+ * it does chaining, but very short chains
+ */
+ class Slice {
+
+ struct Entry {
+ size_t region;
+ unsigned long long value;
+ };
+
+ public:
+
+ Slice() {
+ reset();
+ }
+
+ void reset() {
+ memset( _data , 0 , SliceSize * sizeof(Entry) );
+ }
+
+ State get( int regionHash , size_t region , short offset ) {
+ DEV assert( hash( region ) == regionHash );
+
+ Entry * e = _get( regionHash , region , false );
+ if ( ! e )
+ return Unk;
+
+ return ( e->value & ( ((unsigned long long)1) << offset ) ) ? In : Out;
+ }
+
+ /**
+ * @return true if added, false if full
+ */
+ bool in( int regionHash , size_t region , short offset ) {
+ DEV assert( hash( region ) == regionHash );
+
+ Entry * e = _get( regionHash , region , true );
+ if ( ! e )
+ return false;
+
+ e->value |= ((unsigned long long)1) << offset;
+ return true;
+ }
+
+ private:
+
+ Entry* _get( int start , size_t region , bool add ) {
+ for ( int i=0; i<MaxChain; i++ ) {
+
+ int bucket = ( start + i ) % SliceSize;
+
+ if ( _data[bucket].region == 0 ) {
+ if ( ! add )
+ return 0;
+
+ _data[bucket].region = region;
+ return &_data[bucket];
+ }
+
+ if ( _data[bucket].region == region ) {
+ return &_data[bucket];
+ }
+ }
+ return 0;
+ }
+
+ Entry _data[SliceSize];
+ };
+
+
+ /**
+         * this contains many slices of time
+         * the idea is that you record memory status in the current time slice,
+         * and after a certain period of time it rolls off, so we check again
+ */
+ class Rolling {
+
+ public:
+ Rolling()
+ : _lock( "ps::Rolling" ){
+ _curSlice = 0;
+ _lastRotate = Listener::getElapsedTimeMillis();
+ }
+
+
+ /**
+ * after this call, we assume the page is in ram
+             * @param doHalf if this is a known good access, only consult (and record within) the first half of the slices
+ * @return whether we know the page is in ram
+ */
+ bool access( size_t region , short offset , bool doHalf ) {
+ int regionHash = hash(region);
+
+ SimpleMutex::scoped_lock lk( _lock );
+
+ static int rarely_count = 0;
+ if ( rarely_count++ % 2048 == 0 ) {
+ long long now = Listener::getElapsedTimeMillis();
+ RARELY if ( now == 0 ) {
+ tlog() << "warning Listener::getElapsedTimeMillis returning 0ms" << endl;
+ }
+
+ if ( now - _lastRotate > ( 1000 * RotateTimeSecs ) ) {
+ _rotate();
+ }
+ }
+
+ for ( int i=0; i<NumSlices / ( doHalf ? 2 : 1 ); i++ ) {
+ int pos = (_curSlice+i)%NumSlices;
+ State s = _slices[pos].get( regionHash , region , offset );
+
+ if ( s == In )
+ return true;
+
+ if ( s == Out ) {
+ _slices[pos].in( regionHash , region , offset );
+ return false;
+ }
+ }
+
+ // we weren't in any slice
+ // so add to cur
+ if ( ! _slices[_curSlice].in( regionHash , region , offset ) ) {
+ _rotate();
+ _slices[_curSlice].in( regionHash , region , offset );
+ }
+ return false;
+ }
+
+ private:
+
+ void _rotate() {
+ _curSlice = ( _curSlice + 1 ) % NumSlices;
+ _slices[_curSlice].reset();
+ _lastRotate = Listener::getElapsedTimeMillis();
+ }
+
+ int _curSlice;
+ long long _lastRotate;
+ Slice _slices[NumSlices];
+
+ SimpleMutex _lock;
+ } rolling;
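+
+        // Illustrative summary: access() returns true only when the page was
+        // already marked present in one of the slices searched; otherwise it
+        // records the access (rotating slices when the current one is full)
+        // and returns false.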
+
+ }
+
+ bool Record::MemoryTrackingEnabled = true;
+
+ volatile int __record_touch_dummy = 1; // this is used to make sure the compiler doesn't get too smart on us
+    void Record::touch( bool entireRecord ) {
+ if ( lengthWithHeaders > HeaderSize ) { // this also makes sure lengthWithHeaders is in memory
+ char * addr = data;
+ char * end = data + netLength();
+ for ( ; addr <= end ; addr += 2048 ) {
+ __record_touch_dummy += addr[0];
+
+ break; // TODO: remove this, pending SERVER-3711
+
+ // note if this is a touch of a deletedrecord, we don't want to touch more than the first part. we may simply
+            // be updating the linked list and a deletedrecord could be gigantic. a similar, less extreme circumstance
+ // exists for any record if we are just updating its header, say on a remove(); some sort of hints might be
+ // useful.
+
+            if ( ! entireRecord )
+ break;
+ }
+ }
+ }
+
+ const bool blockSupported = ProcessInfo::blockCheckSupported();
+
+ bool Record::likelyInPhysicalMemory() {
+ if ( ! MemoryTrackingEnabled )
+ return true;
+
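+        // Decompose the record's address: 4KB pages ( >> 12 ), grouped into
+        // regions of 64 pages ( >> 6 ); 'offset' is the page's bit position in
+        // the region's 64-bit presence mask (see ps::Slice).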
+ const size_t page = (size_t)data >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
+
+ if ( ps::rolling.access( region , offset , false ) )
+ return true;
+
+ if ( ! blockSupported ) {
+            // this means we don't fall back to the system call
+ // and assume things aren't in memory
+ // possible we yield too much - but better than not yielding through a fault
+ return false;
+ }
+
+ return ProcessInfo::blockInMemory( data );
+ }
+
+
+ Record* Record::accessed() {
+ const size_t page = (size_t)data >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
+ ps::rolling.access( region , offset , true );
+ return this;
+ }
+
+ Record* DiskLoc::rec() const {
+ Record *r = DataFileMgr::getRecord(*this);
+#if defined(_PAGEFAULTEXCEPTION)
+ DEV ONCE {
+ log() << "_DEBUG info _PAGEFAULTEXCEPTION is ON -- experimental at this time" << endl;
+ }
+ bool fault = !r->likelyInPhysicalMemory();
+ DEV if( rand() % 100 == 0 )
+ fault = true;
+ if( fault &&
+ !cc()._hasWrittenThisPass &&
+ cc()._pageFaultRetryableSection )
+ {
+ if( cc()._pageFaultRetryableSection->_laps > 100 ) {
+ log() << "info pagefaultexception _laps > 100" << endl;
+ }
+ else {
+ throw PageFaultException(r);
+ }
+ }
+#else
+ DEV ONCE {
+ log() << "_DEBUG info _PAGEFAULTEXCEPTION is off" << endl;
+ }
+#endif
+ return r;
+ }
+
+}
diff --git a/src/mongo/db/repl.cpp b/src/mongo/db/repl.cpp
new file mode 100644
index 00000000000..25ecb6b455f
--- /dev/null
+++ b/src/mongo/db/repl.cpp
@@ -0,0 +1,1516 @@
+// repl.cpp
+
+/* TODO
+ PAIRING
+ _ on a syncexception, don't allow going back to master state?
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Collections we use:
+
+ local.sources - indicates what sources we pull from as a "slave", and the last update of each
+ local.oplog.$main - our op log as "master"
+ local.dbinfo.<dbname> - no longer used???
+ local.pair.startup - [deprecated] can contain a special value indicating for a pair that we have the master copy.
+ used when replacing other half of the pair which has permanently failed.
+ local.pair.sync - [deprecated] { initialsynccomplete: 1 }
+*/
+
+#include "pch.h"
+#include "jsobj.h"
+#include "../util/goodies.h"
+#include "repl.h"
+#include "../util/net/message.h"
+#include "../util/background.h"
+#include "../client/dbclient.h"
+#include "../client/connpool.h"
+#include "pdfile.h"
+#include "ops/query.h"
+#include "db.h"
+#include "commands.h"
+#include "security.h"
+#include "cmdline.h"
+#include "repl_block.h"
+#include "repl/rs.h"
+#include "replutil.h"
+#include "repl/connections.h"
+#include "ops/update.h"
+
+namespace mongo {
+
+ // our config from command line etc.
+ ReplSettings replSettings;
+
+    /* if nonzero, sync() is running */
+ volatile int syncing = 0;
+ static volatile int relinquishSyncingSome = 0;
+
+ /* "dead" means something really bad happened like replication falling completely out of sync.
+ when non-null, we are dead and the string is informational
+ */
+ const char *replAllDead = 0;
+
+ time_t lastForcedResync = 0;
+
+} // namespace mongo
+
+namespace mongo {
+
+ /* output by the web console */
+ const char *replInfo = "";
+ struct ReplInfo {
+ ReplInfo(const char *msg) {
+ replInfo = msg;
+ }
+ ~ReplInfo() {
+ replInfo = "?";
+ }
+ };
+
+ /* operator requested resynchronization of replication (on the slave). { resync : 1 } */
+ class CmdResync : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool logTheOp() { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ void help(stringstream&h) const { h << "resync (from scratch) an out of date replica slave.\nhttp://www.mongodb.org/display/DOCS/Master+Slave"; }
+ CmdResync() : Command("resync") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( cmdLine.usingReplSets() ) {
+ errmsg = "resync command not currently supported with replica sets. See RS102 info in the mongodb documentations";
+ result.append("info", "http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member");
+ return false;
+ }
+
+ if ( cmdObj.getBoolField( "force" ) ) {
+ if ( !waitForSyncToFinish( errmsg ) )
+ return false;
+ replAllDead = "resync forced";
+ }
+ if ( !replAllDead ) {
+ errmsg = "not dead, no need to resync";
+ return false;
+ }
+ if ( !waitForSyncToFinish( errmsg ) )
+ return false;
+
+ ReplSource::forceResyncDead( "client" );
+ result.append( "info", "triggered resync for all sources" );
+ return true;
+ }
+ bool waitForSyncToFinish( string &errmsg ) const {
+            // Wait for slave thread to finish syncing, so sources will be
+ // reloaded with new saved state on next pass.
+ Timer t;
+ while ( 1 ) {
+ if ( syncing == 0 || t.millis() > 30000 )
+ break;
+ {
+ dbtemprelease t;
+ relinquishSyncingSome = 1;
+ sleepmillis(1);
+ }
+ }
+ if ( syncing ) {
+ errmsg = "timeout waiting for sync() to finish";
+ return false;
+ }
+ return true;
+ }
+ } cmdResync;
+
+ bool anyReplEnabled() {
+ return replSettings.slave || replSettings.master || theReplSet;
+ }
+
+ bool replAuthenticate(DBClientBase *conn);
+
+ void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level ) {
+
+ if ( replSet ) {
+ if( theReplSet == 0 ) {
+ result.append("ismaster", false);
+ result.append("secondary", false);
+ result.append("info", ReplSet::startupStatusMsg.get());
+ result.append( "isreplicaset" , true );
+ return;
+ }
+
+ theReplSet->fillIsMaster(result);
+ return;
+ }
+
+ if ( replAllDead ) {
+ result.append("ismaster", 0);
+ string s = string("dead: ") + replAllDead;
+ result.append("info", s);
+ }
+ else {
+ result.appendBool("ismaster", _isMaster() );
+ }
+
+ if ( level && replSet ) {
+ result.append( "info" , "is replica set" );
+ }
+ else if ( level ) {
+ BSONObjBuilder sources( result.subarrayStart( "sources" ) );
+
+ readlock lk( "local.sources" );
+ Client::Context ctx( "local.sources", dbpath, authed );
+ shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ int n = 0;
+ while ( c->ok() ) {
+ BSONObj s = c->current();
+
+ BSONObjBuilder bb;
+ bb.append( s["host"] );
+ string sourcename = s["source"].valuestr();
+ if ( sourcename != "main" )
+ bb.append( s["source"] );
+
+ {
+ BSONElement e = s["syncedTo"];
+ BSONObjBuilder t( bb.subobjStart( "syncedTo" ) );
+ t.appendDate( "time" , e.timestampTime() );
+ t.append( "inc" , e.timestampInc() );
+ t.done();
+ }
+
+ if ( level > 1 ) {
+ dbtemprelease unlock;
+                    // note: there is no socket-style (SO_*) timeout on this connection; perhaps we should have one.
+ ScopedDbConnection conn( s["host"].valuestr() );
+ DBClientConnection *cliConn = dynamic_cast< DBClientConnection* >( &conn.conn() );
+ if ( cliConn && replAuthenticate( cliConn ) ) {
+ BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << 1 ) ) );
+ BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << -1 ) ) );
+ bb.appendDate( "masterFirst" , first["ts"].timestampTime() );
+ bb.appendDate( "masterLast" , last["ts"].timestampTime() );
+ double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime());
+ bb.append( "lagSeconds" , lag / 1000 );
+ }
+ conn.done();
+ }
+
+ sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() );
+ c->advance();
+ }
+
+ sources.done();
+ }
+ }
+
+ class CmdIsMaster : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream &help ) const {
+ help << "Check if this server is primary for a replica pair/set; also if it is --master or --slave in simple master/slave setups.\n";
+ help << "{ isMaster : 1 }";
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdIsMaster() : Command("isMaster", true, "ismaster") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+            /* currently a request to an arbiter is (somewhat arbitrarily) an ismaster request that is not
+               authenticated.
+               we allow unauthenticated ismaster, but to be safe we aren't as verbose informationally if
+               one is not authenticated for the admin db.
+ */
+ bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin");
+ appendReplicationInfo( result , authed );
+
+ result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
+ return true;
+ }
+ } cmdismaster;
+
+ ReplSource::ReplSource() {
+ nClonedThisPass = 0;
+ }
+
+ ReplSource::ReplSource(BSONObj o) : nClonedThisPass(0) {
+ only = o.getStringField("only");
+ hostName = o.getStringField("host");
+ _sourceName = o.getStringField("source");
+ uassert( 10118 , "'host' field not set in sources collection object", !hostName.empty() );
+ uassert( 10119 , "only source='main' allowed for now with replication", sourceName() == "main" );
+ BSONElement e = o.getField("syncedTo");
+ if ( !e.eoo() ) {
+ uassert( 10120 , "bad sources 'syncedTo' field value", e.type() == Date || e.type() == Timestamp );
+ OpTime tmp( e.date() );
+ syncedTo = tmp;
+ }
+
+ BSONObj dbsObj = o.getObjectField("dbsNextPass");
+ if ( !dbsObj.isEmpty() ) {
+ BSONObjIterator i(dbsObj);
+ while ( 1 ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ addDbNextPass.insert( e.fieldName() );
+ }
+ }
+
+ dbsObj = o.getObjectField("incompleteCloneDbs");
+ if ( !dbsObj.isEmpty() ) {
+ BSONObjIterator i(dbsObj);
+ while ( 1 ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ incompleteCloneDbs.insert( e.fieldName() );
+ }
+ }
+ }
+
+ /* Turn our C++ Source object into a BSONObj */
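+    /* e.g. (illustrative): { host: "master:27017", source: "main",
+       syncedTo: <timestamp>, dbsNextPass: { somedb: true } } */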
+ BSONObj ReplSource::jsobj() {
+ BSONObjBuilder b;
+ b.append("host", hostName);
+ b.append("source", sourceName());
+ if ( !only.empty() )
+ b.append("only", only);
+ if ( !syncedTo.isNull() )
+ b.appendTimestamp("syncedTo", syncedTo.asDate());
+
+ BSONObjBuilder dbsNextPassBuilder;
+ int n = 0;
+ for ( set<string>::iterator i = addDbNextPass.begin(); i != addDbNextPass.end(); i++ ) {
+ n++;
+ dbsNextPassBuilder.appendBool(*i, 1);
+ }
+ if ( n )
+ b.append("dbsNextPass", dbsNextPassBuilder.done());
+
+ BSONObjBuilder incompleteCloneDbsBuilder;
+ n = 0;
+ for ( set<string>::iterator i = incompleteCloneDbs.begin(); i != incompleteCloneDbs.end(); i++ ) {
+ n++;
+ incompleteCloneDbsBuilder.appendBool(*i, 1);
+ }
+ if ( n )
+ b.append("incompleteCloneDbs", incompleteCloneDbsBuilder.done());
+
+ return b.obj();
+ }
+
+ void ReplSource::save() {
+ BSONObjBuilder b;
+ assert( !hostName.empty() );
+ b.append("host", hostName);
+ // todo: finish allowing multiple source configs.
+ // this line doesn't work right when source is null, if that is allowed as it is now:
+ //b.append("source", _sourceName);
+ BSONObj pattern = b.done();
+
+ BSONObj o = jsobj();
+ log( 1 ) << "Saving repl source: " << o << endl;
+
+ {
+ OpDebug debug;
+ Client::Context ctx("local.sources");
+ UpdateResult res = updateObjects("local.sources", o, pattern, true/*upsert for pair feature*/, false,false,debug);
+ assert( ! res.mod );
+ assert( res.num == 1 );
+ }
+ }
+
+ static void addSourceToList(ReplSource::SourceVector &v, ReplSource& s, ReplSource::SourceVector &old) {
+ if ( !s.syncedTo.isNull() ) { // Don't reuse old ReplSource if there was a forced resync.
+ for ( ReplSource::SourceVector::iterator i = old.begin(); i != old.end(); ) {
+ if ( s == **i ) {
+ v.push_back(*i);
+ old.erase(i);
+ return;
+ }
+ i++;
+ }
+ }
+
+ v.push_back( shared_ptr< ReplSource >( new ReplSource( s ) ) );
+ }
+
+ /* we reuse our existing objects so that we can keep our existing connection
+ and cursor in effect.
+ */
+ void ReplSource::loadAll(SourceVector &v) {
+ Client::Context ctx("local.sources");
+ SourceVector old = v;
+ v.clear();
+
+ if ( !cmdLine.source.empty() ) {
+ // --source <host> specified.
+ // check that no items are in sources other than that
+ // add if missing
+ shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ int n = 0;
+ while ( c->ok() ) {
+ n++;
+ ReplSource tmp(c->current());
+ if ( tmp.hostName != cmdLine.source ) {
+ log() << "repl: --source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl;
+ log() << "repl: for instructions on changing this slave's source, see:" << endl;
+ log() << "http://dochub.mongodb.org/core/masterslave" << endl;
+ log() << "repl: terminating mongod after 30 seconds" << endl;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ }
+ if ( tmp.only != cmdLine.only ) {
+ log() << "--only " << cmdLine.only << " != " << tmp.only << " from local.sources collection" << endl;
+ log() << "terminating after 30 seconds" << endl;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ }
+ c->advance();
+ }
+ uassert( 10002 , "local.sources collection corrupt?", n<2 );
+ if ( n == 0 ) {
+ // source missing. add.
+ ReplSource s;
+ s.hostName = cmdLine.source;
+ s.only = cmdLine.only;
+ s.save();
+ }
+ }
+ else {
+ try {
+ massert( 10384 , "--only requires use of --source", cmdLine.only.empty());
+ }
+ catch ( ... ) {
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+
+ shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ while ( c->ok() ) {
+ ReplSource tmp(c->current());
+ if ( tmp.syncedTo.isNull() ) {
+                DBDirectClient directClient;
+                if ( directClient.exists( "local.oplog.$main" ) ) {
+                    BSONObj op = directClient.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) );
+ if ( !op.isEmpty() ) {
+ tmp.syncedTo = op[ "ts" ].date();
+ }
+ }
+ }
+ addSourceToList(v, tmp, old);
+ c->advance();
+ }
+ }
+
+ BSONObj opTimeQuery = fromjson("{\"getoptime\":1}");
+
+ bool ReplSource::throttledForceResyncDead( const char *requester ) {
+ if ( time( 0 ) - lastForcedResync > 600 ) {
+ forceResyncDead( requester );
+ lastForcedResync = time( 0 );
+ return true;
+ }
+ return false;
+ }
+
+ void ReplSource::forceResyncDead( const char *requester ) {
+ if ( !replAllDead )
+ return;
+ SourceVector sources;
+ ReplSource::loadAll(sources);
+ for( SourceVector::iterator i = sources.begin(); i != sources.end(); ++i ) {
+ log() << requester << " forcing resync from " << (*i)->hostName << endl;
+ (*i)->forceResync( requester );
+ }
+ replAllDead = 0;
+ }
+
+ void ReplSource::forceResync( const char *requester ) {
+ BSONObj info;
+ {
+ dbtemprelease t;
+ if (!oplogReader.connect(hostName)) {
+ msgassertedNoTrace( 14051 , "unable to connect to resync");
+ }
+ /* todo use getDatabaseNames() method here */
+ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 10385 , "Unable to get database list", ok );
+ }
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ string name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( !e.embeddedObject().getBoolField( "empty" ) ) {
+ if ( name != "local" ) {
+ if ( only.empty() || only == name ) {
+ resyncDrop( name.c_str(), requester );
+ }
+ }
+ }
+ }
+ syncedTo = OpTime();
+ addDbNextPass.clear();
+ save();
+ }
+
+ string ReplSource::resyncDrop( const char *db, const char *requester ) {
+ log() << "resync: dropping database " << db << endl;
+ Client::Context ctx(db);
+ dropDatabase(db);
+ return db;
+ }
+
+ /* grab initial copy of a database from the master */
+ void ReplSource::resync(string db) {
+ string dummyNs = resyncDrop( db.c_str(), "internal" );
+ Client::Context ctx( dummyNs );
+ {
+ log() << "resync: cloning database " << db << " to get an initial copy" << endl;
+ ReplInfo r("resync: cloning a database");
+ string errmsg;
+ int errCode = 0;
+ bool ok = cloneFrom(hostName.c_str(), errmsg, cc().database()->name, false, /*slaveOk*/ true, /*replauth*/ true, /*snapshot*/false, /*mayYield*/true, /*mayBeInterrupted*/false, &errCode);
+ if ( !ok ) {
+ if ( errCode == DatabaseDifferCaseCode ) {
+ resyncDrop( db.c_str(), "internal" );
+ log() << "resync: database " << db << " not valid on the master due to a name conflict, dropping." << endl;
+ return;
+ }
+ else {
+ problem() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl;
+ throw SyncException();
+ }
+ }
+ }
+
+ log() << "resync: done with initial clone for db: " << db << endl;
+
+ return;
+ }
+
+ DatabaseIgnorer ___databaseIgnorer;
+
+ void DatabaseIgnorer::doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime ) {
+ if ( futureOplogTime > _ignores[ db ] ) {
+ _ignores[ db ] = futureOplogTime;
+ }
+ }
+
+ bool DatabaseIgnorer::ignoreAt( const string &db, const OpTime &currentOplogTime ) {
+ if ( _ignores[ db ].isNull() ) {
+ return false;
+ }
+ if ( _ignores[ db ] >= currentOplogTime ) {
+ return true;
+ } else {
+ // The ignore state has expired, so clear it.
+ _ignores.erase( db );
+ return false;
+ }
+ }
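+
+    // Illustrative behavior: after doIgnoreUntilAfter( "foo", t ), ignoreAt( "foo", u )
+    // returns true for any u <= t; the first call with u > t clears the ignore
+    // state and returns false.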
+
+ bool ReplSource::handleDuplicateDbName( const BSONObj &op, const char *ns, const char *db ) {
+ if ( dbHolder()._isLoaded( ns, dbpath ) ) {
+ // Database is already present.
+ return true;
+ }
+ BSONElement ts = op.getField( "ts" );
+ if ( ( ts.type() == Date || ts.type() == Timestamp ) && ___databaseIgnorer.ignoreAt( db, ts.date() ) ) {
+ // Database is ignored due to a previous indication that it is
+ // missing from master after optime "ts".
+ return false;
+ }
+ if ( Database::duplicateUncasedName( false, db, dbpath ).empty() ) {
+ // No duplicate database names are present.
+ return true;
+ }
+
+ OpTime lastTime;
+ bool dbOk = false;
+ {
+ dbtemprelease release;
+
+ // We always log an operation after executing it (never before), so
+ // a database list will always be valid as of an oplog entry generated
+ // before it was retrieved.
+
+ BSONObj last = oplogReader.findOne( this->ns().c_str(), Query().sort( BSON( "$natural" << -1 ) ) );
+ if ( !last.isEmpty() ) {
+ BSONElement ts = last.getField( "ts" );
+ massert( 14032, "Invalid 'ts' in remote log", ts.type() == Date || ts.type() == Timestamp );
+ lastTime = OpTime( ts.date() );
+ }
+
+ BSONObj info;
+ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 14033, "Unable to get database list", ok );
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+
+ const char * name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( strcasecmp( name, db ) != 0 )
+ continue;
+
+ if ( strcmp( name, db ) == 0 ) {
+ // The db exists on master, still need to check that no conflicts exist there.
+ dbOk = true;
+ continue;
+ }
+
+ // The master has a db name that conflicts with the requested name.
+ dbOk = false;
+ break;
+ }
+ }
+
+ if ( !dbOk ) {
+ ___databaseIgnorer.doIgnoreUntilAfter( db, lastTime );
+ incompleteCloneDbs.erase(db);
+ addDbNextPass.erase(db);
+ return false;
+ }
+
+ // Check for duplicates again, since we released the lock above.
+ set< string > duplicates;
+ Database::duplicateUncasedName( false, db, dbpath, &duplicates );
+
+ // The database is present on the master and no conflicting databases
+ // are present on the master. Drop any local conflicts.
+ for( set< string >::const_iterator i = duplicates.begin(); i != duplicates.end(); ++i ) {
+ ___databaseIgnorer.doIgnoreUntilAfter( *i, lastTime );
+ incompleteCloneDbs.erase(*i);
+ addDbNextPass.erase(*i);
+ Client::Context ctx(*i);
+ dropDatabase(*i);
+ }
+
+ massert( 14034, "Duplicate database names present after attempting to delete duplicates",
+ Database::duplicateUncasedName( false, db, dbpath ).empty() );
+ return true;
+ }
+
+ void ReplSource::applyOperation(const BSONObj& op) {
+ try {
+ bool failedUpdate = applyOperation_inlock( op );
+ if (failedUpdate) {
+ Sync sync(hostName);
+ if (sync.shouldRetry(op)) {
+ uassert(15914, "Failure retrying initial sync update", !applyOperation_inlock(op));
+ }
+ }
+ }
+ catch ( UserException& e ) {
+ log() << "sync: caught user assertion " << e << " while applying op: " << op << endl;;
+ }
+ catch ( DBException& e ) {
+ log() << "sync: caught db exception " << e << " while applying op: " << op << endl;;
+ }
+
+ }
+
+    /* local.oplog.$main is of the form:
+ { ts: ..., op: <optype>, ns: ..., o: <obj> , o2: <extraobj>, b: <boolflag> }
+ ...
+ see logOp() comments.
+
+ @param alreadyLocked caller already put us in write lock if true
+ */
+ void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked) {
+        if( logLevel >= 6 ) // op.toString() is expensive so doing this check explicitly
+ log(6) << "processing op: " << op << endl;
+
+ if( op.getStringField("op")[0] == 'n' )
+ return;
+
+ char clientName[MaxDatabaseNameLen];
+ const char *ns = op.getStringField("ns");
+ nsToDatabase(ns, clientName);
+
+ if ( *ns == '.' ) {
+ problem() << "skipping bad op in oplog: " << op.toString() << endl;
+ return;
+ }
+ else if ( *ns == 0 ) {
+ /*if( op.getStringField("op")[0] != 'n' )*/ {
+ problem() << "halting replication, bad op in oplog:\n " << op.toString() << endl;
+ replAllDead = "bad object in oplog";
+ throw SyncException();
+ }
+ //ns = "local.system.x";
+ //nsToDatabase(ns, clientName);
+ }
+
+ if ( !only.empty() && only != clientName )
+ return;
+
+ if( cmdLine.pretouch && !alreadyLocked/*doesn't make sense if in write lock already*/ ) {
+ if( cmdLine.pretouch > 1 ) {
+ /* note: this is bad - should be put in ReplSource. but this is first test... */
+ static int countdown;
+ assert( countdown >= 0 );
+ if( countdown > 0 ) {
+ countdown--; // was pretouched on a prev pass
+ }
+ else {
+ const int m = 4;
+ if( tp.get() == 0 ) {
+ int nthr = min(8, cmdLine.pretouch);
+ nthr = max(nthr, 1);
+ tp.reset( new ThreadPool(nthr) );
+ }
+ vector<BSONObj> v;
+ oplogReader.peek(v, cmdLine.pretouch);
+ unsigned a = 0;
+ while( 1 ) {
+ if( a >= v.size() ) break;
+ unsigned b = a + m - 1; // v[a..b]
+ if( b >= v.size() ) b = v.size() - 1;
+ tp->schedule(pretouchN, v, a, b);
+ DEV cout << "pretouch task: " << a << ".." << b << endl;
+ a += m;
+ }
+ // we do one too...
+ pretouchOperation(op);
+ tp->join();
+ countdown = v.size();
+ }
+ }
+ else {
+ pretouchOperation(op);
+ }
+ }
+
+ scoped_ptr<writelock> lk( alreadyLocked ? 0 : new writelock() );
+
+ if ( replAllDead ) {
+ // hmmm why is this check here and not at top of this function? does it get set between top and here?
+ log() << "replAllDead, throwing SyncException: " << replAllDead << endl;
+ throw SyncException();
+ }
+
+ if ( !handleDuplicateDbName( op, ns, clientName ) ) {
+ return;
+ }
+
+ Client::Context ctx( ns );
+ ctx.getClient()->curop()->reset();
+
+ bool empty = ctx.db()->isEmpty();
+ bool incompleteClone = incompleteCloneDbs.count( clientName ) != 0;
+
+ if( logLevel >= 6 )
+ log(6) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl;
+
+        // always apply admin commands
+ // this is a bit hacky -- the semantics of replication/commands aren't well specified
+ if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) {
+ applyOperation( op );
+ return;
+ }
+
+ if ( ctx.justCreated() || empty || incompleteClone ) {
+ // we must add to incomplete list now that setClient has been called
+ incompleteCloneDbs.insert( clientName );
+ if ( nClonedThisPass ) {
+                /* we only clone one database per pass, even if a lot need to be done.  This helps us
+ avoid overflowing the master's transaction log by doing too much work before going
+ back to read more transactions. (Imagine a scenario of slave startup where we try to
+ clone 100 databases in one pass.)
+ */
+ addDbNextPass.insert( clientName );
+ }
+ else {
+ if ( incompleteClone ) {
+ log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." << endl;
+ }
+ save();
+ Client::Context ctx(ns);
+ nClonedThisPass++;
+ resync(ctx.db()->name);
+ addDbNextPass.erase(clientName);
+ incompleteCloneDbs.erase( clientName );
+ }
+ save();
+ }
+ else {
+ applyOperation( op );
+ addDbNextPass.erase( clientName );
+ }
+ }
+
+ void ReplSource::syncToTailOfRemoteLog() {
+ string _ns = ns();
+ BSONObjBuilder b;
+ if ( !only.empty() ) {
+ b.appendRegex("ns", string("^") + only);
+ }
+ BSONObj last = oplogReader.findOne( _ns.c_str(), Query( b.done() ).sort( BSON( "$natural" << -1 ) ) );
+ if ( !last.isEmpty() ) {
+ BSONElement ts = last.getField( "ts" );
+ massert( 10386 , "non Date ts found: " + last.toString(), ts.type() == Date || ts.type() == Timestamp );
+ syncedTo = OpTime( ts.date() );
+ }
+ }
+
+ extern unsigned replApplyBatchSize;
+
+ /* slave: pull some data from the master's oplog
+ note: not yet in db mutex at this point.
+ @return -1 error
+ 0 ok, don't sleep
+ 1 ok, sleep
+ */
+ int ReplSource::sync_pullOpLog(int& nApplied) {
+ int okResultCode = 1;
+ string ns = string("local.oplog.$") + sourceName();
+ log(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n';
+
+ bool tailing = true;
+ oplogReader.tailCheck();
+
+ bool initial = syncedTo.isNull();
+
+ if ( !oplogReader.haveCursor() || initial ) {
+ if ( initial ) {
+ // Important to grab last oplog timestamp before listing databases.
+ syncToTailOfRemoteLog();
+ BSONObj info;
+ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 10389 , "Unable to get database list", ok );
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ string name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( !e.embeddedObject().getBoolField( "empty" ) ) {
+ if ( name != "local" ) {
+ if ( only.empty() || only == name ) {
+ log( 2 ) << "adding to 'addDbNextPass': " << name << endl;
+ addDbNextPass.insert( name );
+ }
+ }
+ }
+ }
+ dblock lk;
+ save();
+ }
+
+ BSONObjBuilder q;
+ q.appendDate("$gte", syncedTo.asDate());
+ BSONObjBuilder query;
+ query.append("ts", q.done());
+ if ( !only.empty() ) {
+                // note we may skip a LOT of data table scanning here, saving the master a lot of work.
+ query.appendRegex("ns", string("^") + only); // maybe append "\\." here?
+ }
+ BSONObj queryObj = query.done();
+ // e.g. queryObj = { ts: { $gte: syncedTo } }
+
+ oplogReader.tailingQuery(ns.c_str(), queryObj);
+ tailing = false;
+ }
+ else {
+ log(2) << "repl: tailing=true\n";
+ }
+
+ if( !oplogReader.haveCursor() ) {
+ problem() << "repl: dbclient::query returns null (conn closed?)" << endl;
+ oplogReader.resetConnection();
+ return -1;
+ }
+
+ // show any deferred database creates from a previous pass
+ {
+ set<string>::iterator i = addDbNextPass.begin();
+ if ( i != addDbNextPass.end() ) {
+ BSONObjBuilder b;
+ b.append("ns", *i + '.');
+ b.append("op", "db");
+ BSONObj op = b.done();
+ sync_pullOpLog_applyOperation(op, false);
+ }
+ }
+
+ if ( !oplogReader.more() ) {
+ if ( tailing ) {
+ log(2) << "repl: tailing & no new activity\n";
+ if( oplogReader.awaitCapable() )
+ okResultCode = 0; // don't sleep
+
+ }
+ else {
+ log() << "repl: " << ns << " oplog is empty\n";
+ }
+ {
+ dblock lk;
+ save();
+ }
+ return okResultCode;
+ }
+
+ OpTime nextOpTime;
+ {
+ BSONObj op = oplogReader.next();
+ BSONElement ts = op.getField("ts");
+ if ( ts.type() != Date && ts.type() != Timestamp ) {
+ string err = op.getStringField("$err");
+ if ( !err.empty() ) {
+ // 13051 is "tailable cursor requested on non capped collection"
+ if (op.getIntField("code") == 13051) {
+ problem() << "trying to slave off of a non-master" << '\n';
+ massert( 13344 , "trying to slave off of a non-master", false );
+ }
+ else {
+ problem() << "repl: $err reading remote oplog: " + err << '\n';
+ massert( 10390 , "got $err reading remote oplog", false );
+ }
+ }
+ else {
+ problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n';
+ massert( 10391 , "repl: bad object read from remote oplog", false);
+ }
+ }
+
+ nextOpTime = OpTime( ts.date() );
+ log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n';
+ if ( initial ) {
+ log(1) << "repl: initial run\n";
+ }
+ if( tailing ) {
+ if( !( syncedTo < nextOpTime ) ) {
+ log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl;
+ log() << "repl syncTo: " << syncedTo.toStringLong() << endl;
+ log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl;
+ assert(false);
+ }
+ oplogReader.putBack( op ); // op will be processed in the loop below
+ nextOpTime = OpTime(); // will reread the op below
+ }
+ else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error
+ Nullstream& l = log();
+ l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' ';
+ if ( nextOpTime < syncedTo )
+ l << "<??";
+ else
+ l << ">";
+
+ l << " syncedTo " << syncedTo.toStringLong() << '\n';
+ log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n";
+ log() << "repl: tailing: " << tailing << '\n';
+ log() << "repl: data too stale, halting replication" << endl;
+ replInfo = replAllDead = "data too stale halted replication";
+ assert( syncedTo < nextOpTime );
+ throw SyncException();
+ }
+ else {
+ /* t == syncedTo, so the first op was applied previously or it is the first op of the initial query and need not be applied. */
+ }
+ }
+
+ // apply operations
+ {
+ int n = 0;
+ time_t saveLast = time(0);
+ while ( 1 ) {
+
+ bool moreInitialSyncsPending = !addDbNextPass.empty() && n; // we need "&& n" to ensure we actually process at least one op to get a sync point recorded in the first place.
+
+ if ( moreInitialSyncsPending || !oplogReader.more() ) {
+ dblock lk;
+
+ // NOTE aaron 2011-03-29 This block may be unnecessary, but I'm leaving it in place to avoid changing timing behavior.
+ {
+ dbtemprelease t;
+ if ( !moreInitialSyncsPending && oplogReader.more() ) {
+ continue;
+ }
+ // otherwise, break out of loop so we can set to completed or clone more dbs
+ }
+
+ if( oplogReader.awaitCapable() && tailing )
+ okResultCode = 0; // don't sleep
+ syncedTo = nextOpTime;
+ save(); // note how far we are synced up to now
+ log() << "repl: applied " << n << " operations" << endl;
+ nApplied = n;
+ log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl;
+ break;
+ }
+
+ OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) {
+ // periodically note our progress, in case we are doing a lot of work and crash
+ dblock lk;
+ syncedTo = nextOpTime;
+ // can't update local log ts since there are pending operations from our peer
+ save();
+ log() << "repl: checkpoint applied " << n << " operations" << endl;
+ log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
+ saveLast = time(0);
+ n = 0;
+ }
+
+ BSONObj op = oplogReader.next();
+
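+ /* when replApplyBatchSize > 1 we take the write lock once here and apply up to that
+ many ops inside it; with a batch size of 1 no lock is taken at this level and
+ sync_pullOpLog_applyOperation acquires it per operation. */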
+ unsigned b = replApplyBatchSize;
+ bool justOne = b == 1;
+ scoped_ptr<writelock> lk( justOne ? 0 : new writelock() );
+ while( 1 ) {
+
+ BSONElement ts = op.getField("ts");
+ if( !( ts.type() == Date || ts.type() == Timestamp ) ) {
+ log() << "sync error: problem querying remote oplog record" << endl;
+ log() << "op: " << op.toString() << endl;
+ log() << "halting replication" << endl;
+ replInfo = replAllDead = "sync error: no ts found querying remote oplog record";
+ throw SyncException();
+ }
+ OpTime last = nextOpTime;
+ nextOpTime = OpTime( ts.date() );
+ if ( !( last < nextOpTime ) ) {
+ log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl;
+ log() << " last: " << last.toStringLong() << endl;
+ log() << " nextOpTime: " << nextOpTime.toStringLong() << endl;
+ log() << " halting replication" << endl;
+ replInfo = replAllDead = "sync error last >= nextOpTime";
+ uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false);
+ }
+ if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) {
+ assert( justOne );
+ oplogReader.putBack( op );
+ _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1;
+ dblock lk;
+ if ( n > 0 ) {
+ syncedTo = last;
+ save();
+ }
+ log() << "repl: applied " << n << " operations" << endl;
+ log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
+ log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl;
+ return okResultCode;
+ }
+
+ sync_pullOpLog_applyOperation(op, !justOne);
+ n++;
+
+ if( --b == 0 )
+ break;
+ // if we get here, we are applying multiple operations in a single write lock acquisition
+ if( !oplogReader.moreInCurrentBatch() ) {
+ // break if no more in batch so we release lock while reading from the master
+ break;
+ }
+ op = oplogReader.next();
+
+ getDur().commitIfNeeded();
+ }
+ }
+ }
+
+ return okResultCode;
+ }
+
+ BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}");
+
+ bool replAuthenticate(DBClientBase *conn) {
+ if( noauth ) {
+ return true;
+ }
+ if( ! cc().isAdmin() ) {
+ log() << "replauthenticate: requires admin permissions, failing\n";
+ return false;
+ }
+
+ string u;
+ string p;
+ if (internalSecurity.pwd.length() > 0) {
+ u = internalSecurity.user;
+ p = internalSecurity.pwd;
+ }
+ else {
+ BSONObj user;
+ {
+ dblock lk;
+ Client::Context ctxt("local.");
+ if( !Helpers::findOne("local.system.users", userReplQuery, user) ||
+ // try the first user in local
+ !Helpers::getSingleton("local.system.users", user) ) {
+ log() << "replauthenticate: no user in local.system.users to use for authentication\n";
+ return false;
+ }
+ }
+ u = user.getStringField("user");
+ p = user.getStringField("pwd");
+ massert( 10392 , "bad user object? [1]", !u.empty());
+ massert( 10393 , "bad user object? [2]", !p.empty());
+ }
+
+ string err;
+ if( !conn->auth("local", u.c_str(), p.c_str(), err, false) ) {
+ log() << "replauthenticate: can't authenticate to master server, user:" << u << endl;
+ return false;
+ }
+ return true;
+ }
+
+ bool replHandshake(DBClientConnection *conn) {
+
+ string myname = getHostName();
+
+ BSONObj me;
+ {
+
+ dblock l;
+ // local.me is an identifier for a server for getLastError w:2+
+ if ( ! Helpers::getSingleton( "local.me" , me ) ||
+ ! me.hasField("host") ||
+ me["host"].String() != myname ) {
+
+ // clean out local.me
+ Helpers::emptyCollection("local.me");
+
+ // repopulate
+ BSONObjBuilder b;
+ b.appendOID( "_id" , 0 , true );
+ b.append( "host", myname );
+ me = b.obj();
+ Helpers::putSingleton( "local.me" , me );
+ }
+ }
+
+ BSONObjBuilder cmd;
+ cmd.appendAs( me["_id"] , "handshake" );
+ if (theReplSet) {
+ cmd.append("member", theReplSet->selfId());
+ }
+
+ BSONObj res;
+ bool ok = conn->runCommand( "admin" , cmd.obj() , res );
+ // failures are ignored on purpose for now, for compatibility with older versions
+ log(ok) << "replHandshake result: " << ok << " res: " << res << endl;
+ return true;
+ }
+
+ bool OplogReader::commonConnect(const string& hostName) {
+ if( conn() == 0 ) {
+ _conn = shared_ptr<DBClientConnection>(new DBClientConnection( false, 0, 0 /* tcp timeout */));
+ string errmsg;
+ ReplInfo r("trying to connect to sync source");
+ if ( !_conn->connect(hostName.c_str(), errmsg) ||
+ (!noauth && !replAuthenticate(_conn.get())) ) {
+ resetConnection();
+ log() << "repl: " << errmsg << endl;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool OplogReader::connect(string hostName) {
+ if (conn() != 0) {
+ return true;
+ }
+
+ if (commonConnect(hostName)) {
+ return replHandshake(_conn.get());
+ }
+ return false;
+ }
+
+ bool OplogReader::connect(const BSONObj& rid, const int from, const string& to) {
+ if (conn() != 0) {
+ return true;
+ }
+ if (commonConnect(to)) {
+ log() << "handshake between " << from << " and " << to << endl;
+ return passthroughHandshake(rid, from);
+ }
+ return false;
+ }
+
+ bool OplogReader::passthroughHandshake(const BSONObj& rid, const int f) {
+ BSONObjBuilder cmd;
+ cmd.appendAs( rid["_id"], "handshake" );
+ cmd.append( "member" , f );
+
+ BSONObj res;
+ return conn()->runCommand( "admin" , cmd.obj() , res );
+ }
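+
+ /* illustrative only: the command built above has the form
+ { handshake: ObjectId("..."), member: <f> }
+ i.e. the same shape replHandshake sends using this server's own local.me _id. */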
+
+ /* note: not yet in mutex at this point.
+ returns >= 0 if ok. returns -1 if the caller should reconnect.
+ return value of zero indicates no sleep necessary before next call
+ */
+ int ReplSource::sync(int& nApplied) {
+ _sleepAdviceTime = 0;
+ ReplInfo r("sync");
+ if ( !cmdLine.quiet ) {
+ Nullstream& l = log();
+ l << "repl: syncing from ";
+ if( sourceName() != "main" ) {
+ l << "source:" << sourceName() << ' ';
+ }
+ l << "host:" << hostName << endl;
+ }
+ nClonedThisPass = 0;
+
+ // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName.
+ if ( (string("localhost") == hostName || string("127.0.0.1") == hostName) && cmdLine.port == CmdLine::DefaultDBPort ) {
+ log() << "repl: can't sync from self (localhost). sources configuration may be wrong." << endl;
+ sleepsecs(5);
+ return -1;
+ }
+
+ if ( !oplogReader.connect(hostName) ) {
+ log(4) << "repl: can't connect to sync source" << endl;
+ return -1;
+ }
+
+ /*
+ // get current mtime at the server.
+ BSONObj o = conn->findOne("admin.$cmd", opTimeQuery);
+ BSONElement e = o.getField("optime");
+ if( e.eoo() ) {
+ log() << "repl: failed to get cur optime from master" << endl;
+ log() << " " << o.toString() << endl;
+ return false;
+ }
+ uassert( 10124 , e.type() == Date );
+ OpTime serverCurTime;
+ serverCurTime.asDate() = e.date();
+ */
+ return sync_pullOpLog(nApplied);
+ }
+
+ /* --------------------------------------------------------------*/
+
+ /*
+ TODO:
+ _ source has autoptr to the cursor
+ _ reuse that cursor when we can
+ */
+
+ /* returns: # of seconds to sleep before next pass
+ 0 = no sleep recommended
+ 1 = special sentinel indicating adaptive sleep recommended
+ */
+ int _replMain(ReplSource::SourceVector& sources, int& nApplied) {
+ {
+ ReplInfo r("replMain load sources");
+ dblock lk;
+ ReplSource::loadAll(sources);
+ replSettings.fastsync = false; // only need this param for initial reset
+ }
+
+ if ( sources.empty() ) {
+ /* replication is not configured yet (for --slave) in local.sources. Poll for it
+ every 20 seconds.
+ */
+ log() << "no source given, add a master to local.sources to start replication" << endl;
+ return 20;
+ }
+
+ int sleepAdvice = 1;
+ for ( ReplSource::SourceVector::iterator i = sources.begin(); i != sources.end(); i++ ) {
+ ReplSource *s = i->get();
+ int res = -1;
+ try {
+ res = s->sync(nApplied);
+ bool moreToSync = s->haveMoreDbsToSync();
+ if( res < 0 ) {
+ sleepAdvice = 3;
+ }
+ else if( moreToSync ) {
+ sleepAdvice = 0;
+ }
+ else if ( s->sleepAdvice() ) {
+ sleepAdvice = s->sleepAdvice();
+ }
+ else
+ sleepAdvice = res;
+ }
+ catch ( const SyncException& ) {
+ log() << "caught SyncException" << endl;
+ return 10;
+ }
+ catch ( AssertionException& e ) {
+ if ( e.severe() ) {
+ log() << "replMain AssertionException " << e.what() << endl;
+ return 60;
+ }
+ else {
+ log() << "repl: AssertionException " << e.what() << '\n';
+ }
+ replInfo = "replMain caught AssertionException";
+ }
+ catch ( const DBException& e ) {
+ log() << "repl: DBException " << e.what() << endl;
+ replInfo = "replMain caught DBException";
+ }
+ catch ( const std::exception &e ) {
+ log() << "repl: std::exception " << e.what() << endl;
+ replInfo = "replMain caught std::exception";
+ }
+ catch ( ... ) {
+ log() << "unexpected exception during replication. replication will halt" << endl;
+ replAllDead = "caught unexpected exception during replication";
+ }
+ if ( res < 0 )
+ s->oplogReader.resetConnection();
+ }
+ return sleepAdvice;
+ }
+
+ void replMain() {
+ ReplSource::SourceVector sources;
+ while ( 1 ) {
+ int s = 0;
+ {
+ dblock lk;
+ if ( replAllDead ) {
+ // throttledForceResyncDead can throw
+ if ( !replSettings.autoresync || !ReplSource::throttledForceResyncDead( "auto" ) ) {
+ log() << "all sources dead: " << replAllDead << ", sleeping for 5 seconds" << endl;
+ break;
+ }
+ }
+ assert( syncing == 0 ); // i.e., there is only one sync thread running. we will want to change/fix this.
+ syncing++;
+ }
+ try {
+ int nApplied = 0;
+ s = _replMain(sources, nApplied);
+ if( s == 1 ) {
+ if( nApplied == 0 ) s = 2;
+ else if( nApplied > 100 ) {
+ // sleep very little - just enough that we aren't truly hammering the master
+ sleepmillis(75);
+ s = 0;
+ }
+ }
+ }
+ catch (...) {
+ out() << "caught exception in _replMain" << endl;
+ s = 4;
+ }
+ {
+ dblock lk;
+ assert( syncing == 1 );
+ syncing--;
+ }
+
+ if( relinquishSyncingSome ) {
+ relinquishSyncingSome = 0;
+ s = 1; // sleep before going back in to syncing=1
+ }
+
+ if ( s ) {
+ stringstream ss;
+ ss << "repl: sleep " << s << " sec before next pass";
+ string msg = ss.str();
+ if ( ! cmdLine.quiet )
+ log() << msg << endl;
+ ReplInfo r(msg.c_str());
+ sleepsecs(s);
+ }
+ }
+ }
+
+ static void replMasterThread() {
+ sleepsecs(4);
+ Client::initThread("replmaster");
+ int toSleep = 10;
+ while( 1 ) {
+
+ sleepsecs( toSleep );
+ /* write a keep-alive like entry to the log. this will make things like
+ printReplicationStatus() and printSlaveReplicationStatus() stay up-to-date
+ even when things are idle.
+ */
+ {
+ writelocktry lk("",1);
+ if ( lk.got() ) {
+ toSleep = 10;
+
+ replLocalAuth();
+
+ try {
+ logKeepalive();
+ }
+ catch(...) {
+ log() << "caught exception in replMasterThread()" << endl;
+ }
+ }
+ else {
+ log(5) << "couldn't logKeepalive" << endl;
+ toSleep = 1;
+ }
+ }
+ }
+ }
+
+ void replSlaveThread() {
+ sleepsecs(1);
+ Client::initThread("replslave");
+ cc().iAmSyncThread();
+
+ {
+ dblock lk;
+ replLocalAuth();
+ }
+
+ while ( 1 ) {
+ try {
+ replMain();
+ sleepsecs(5);
+ }
+ catch ( AssertionException& ) {
+ ReplInfo r("Assertion in replSlaveThread(): sleeping 5 minutes before retry");
+ problem() << "Assertion in replSlaveThread(): sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ catch ( DBException& e ) {
+ problem() << "exception in replSlaveThread(): " << e.what()
+ << ", sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ catch ( ... ) {
+ problem() << "error in replSlaveThread(): sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ }
+ }
+
+ void tempThread() {
+ while ( 1 ) {
+ out() << d.dbMutex.info().isLocked() << endl;
+ sleepmillis(100);
+ }
+ }
+
+ void newRepl();
+ void oldRepl();
+ void startReplSets(ReplSetCmdline*);
+ void startReplication() {
+ /* if we are going to be a replica set, we aren't doing other forms of replication. */
+ if( !cmdLine._replSet.empty() ) {
+ if( replSettings.slave || replSettings.master ) {
+ log() << "***" << endl;
+ log() << "ERROR: can't use --slave or --master replication options with --replSet" << endl;
+ log() << "***" << endl;
+ }
+ newRepl();
+
+ replSet = true;
+ ReplSetCmdline *replSetCmdline = new ReplSetCmdline(cmdLine._replSet);
+ boost::thread t( boost::bind( &startReplSets, replSetCmdline) );
+
+ return;
+ }
+
+ oldRepl();
+
+ /* this was just to see if anything locks for longer than it should -- we need to be careful
+ not to be locked when trying to connect() or query() the other side.
+ */
+ //boost::thread tempt(tempThread);
+
+ if( !replSettings.slave && !replSettings.master )
+ return;
+
+ {
+ dblock lk;
+ replLocalAuth();
+ }
+
+ if ( replSettings.slave ) {
+ assert( replSettings.slave == SimpleSlave );
+ log(1) << "slave=true" << endl;
+ boost::thread repl_thread(replSlaveThread);
+ }
+
+ if ( replSettings.master ) {
+ log(1) << "master=true" << endl;
+ replSettings.master = true;
+ createOplog();
+ boost::thread t(replMasterThread);
+ }
+
+ while( replSettings.fastsync ) // don't allow writes until we've set up from log
+ sleepmillis( 50 );
+ }
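+
+ /* illustrative summary of the paths above:
+ --replSet <name>[/<seeds>] -> newRepl(), background thread runs startReplSets()
+ --master -> oldRepl(), createOplog(), replMasterThread
+ --slave -> oldRepl(), replSlaveThread (sources come from local.sources)
+ */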
+
+ void testPretouch() {
+ int nthr = min(8, 8);
+ nthr = max(nthr, 1);
+ int m = 8 / nthr;
+ ThreadPool tp(nthr);
+ vector<BSONObj> v;
+
+ BSONObj x = BSON( "ns" << "test.foo" << "o" << BSON( "_id" << 1 ) << "op" << "i" );
+
+ v.push_back(x);
+ v.push_back(x);
+ v.push_back(x);
+
+ unsigned a = 0;
+ while( 1 ) {
+ if( a >= v.size() ) break;
+ unsigned b = a + m - 1; // v[a..b]
+ if( b >= v.size() ) b = v.size() - 1;
+ tp.schedule(pretouchN, v, a, b);
+ DEV cout << "pretouch task: " << a << ".." << b << endl;
+ a += m;
+ }
+ tp.join();
+ }
+
+ class ReplApplyBatchSizeValidator : public ParameterValidator {
+ public:
+ ReplApplyBatchSizeValidator() : ParameterValidator( "replApplyBatchSize" ) {}
+
+ virtual bool isValid( BSONElement e , string& errmsg ) const {
+ int b = e.numberInt();
+ if( b < 1 || b > 1024 ) {
+ errmsg = "replApplyBatchSize has to be >= 1 and <= 1024";
+ return false;
+ }
+
+ if ( replSettings.slavedelay != 0 && b > 1 ) {
+ errmsg = "can't use a batch size > 1 with slavedelay";
+ return false;
+ }
+ if ( ! replSettings.slave ) {
+ errmsg = "can't set replApplyBatchSize on a non-slave machine";
+ return false;
+ }
+
+ return true;
+ }
+ } replApplyBatchSizeValidator;
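+
+ /* illustrative only (assumed usage): this validator is consulted when the parameter
+ is changed at runtime, e.g. from the shell:
+ db.adminCommand( { setParameter: 1, replApplyBatchSize: 8 } )
+ which succeeds only on a slave with no slavedelay and 1 <= value <= 1024. */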
+
+} // namespace mongo
diff --git a/src/mongo/db/repl.h b/src/mongo/db/repl.h
new file mode 100644
index 00000000000..83242d0a4ce
--- /dev/null
+++ b/src/mongo/db/repl.h
@@ -0,0 +1,199 @@
+// repl.h - replication
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* replication data overview
+
+ at the slave:
+ local.sources { host: ..., source: ..., only: ..., syncedTo: ..., localLogTs: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+
+ at the master:
+ local.oplog.$<source>
+*/
+
+#pragma once
+
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "../client/dbclient.h"
+#include "../util/optime.h"
+#include "oplog.h"
+#include "../util/concurrency/thread_pool.h"
+#include "oplogreader.h"
+#include "cloner.h"
+
+namespace mongo {
+
+ /* is this server a replication slave?
+ --slave cmd line setting -> SimpleSlave
+ */
+ typedef enum { NotSlave=0, SimpleSlave } SlaveTypes;
+
+ class ReplSettings {
+ public:
+ SlaveTypes slave;
+
+ /** true means we are master and doing replication. if we are not writing to oplog, this won't be true. */
+ bool master;
+
+ bool fastsync;
+
+ bool autoresync;
+
+ int slavedelay;
+
+ set<string> discoveredSeeds;
+ mutex discoveredSeeds_mx;
+
+ BSONObj reconfig;
+
+ ReplSettings()
+ : slave(NotSlave),
+ master(false),
+ fastsync(),
+ autoresync(false),
+ slavedelay(),
+ discoveredSeeds(),
+ discoveredSeeds_mx("ReplSettings::discoveredSeeds") {
+ }
+
+ };
+
+ extern ReplSettings replSettings;
+
+ /* A replication exception */
+ class SyncException : public DBException {
+ public:
+ SyncException() : DBException( "sync exception" , 10001 ) {}
+ };
+
+ /* A Source is a source from which we can pull (replicate) data.
+ stored in collection local.sources.
+
+ Can be a group of things to replicate for several databases.
+
+ { host: ..., source: ..., only: ..., syncedTo: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+
+ 'source' defaults to 'main'; support for multiple source names is
+ not done (always use main for now).
+ */
+ class ReplSource {
+ shared_ptr<ThreadPool> tp;
+
+ void resync(string db);
+
+ /** @param alreadyLocked caller already put us in write lock if true */
+ void sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked);
+
+ /* pull some operations from the master's oplog, and apply them.
+ calls sync_pullOpLog_applyOperation
+ */
+ int sync_pullOpLog(int& nApplied);
+
+ /* we only clone one database per pass, even if a lot need done. This helps us
+ avoid overflowing the master's transaction log by doing too much work before going
+ back to read more transactions. (Imagine a scenario of slave startup where we try to
+ clone 100 databases in one pass.)
+ */
+ set<string> addDbNextPass;
+
+ set<string> incompleteCloneDbs;
+
+ ReplSource();
+
+ // returns the dummy ns used to do the drop
+ string resyncDrop( const char *db, const char *requester );
+ // call without the db mutex
+ void syncToTailOfRemoteLog();
+ string ns() const { return string( "local.oplog.$" ) + sourceName(); }
+ unsigned _sleepAdviceTime;
+
+ /**
+ * If 'db' is a new database and its name would conflict with that of
+ * an existing database, synchronize these database names with the
+ * master.
+ * @return true iff an op with the specified ns may be applied.
+ */
+ bool handleDuplicateDbName( const BSONObj &op, const char *ns, const char *db );
+
+ public:
+ OplogReader oplogReader;
+
+ void applyOperation(const BSONObj& op);
+ string hostName; // ip addr or hostname plus optionally, ":<port>"
+ string _sourceName; // a logical source name.
+ string sourceName() const { return _sourceName.empty() ? "main" : _sourceName; }
+ string only; // only a certain db. note that in the sources collection, this may not be changed once you start replicating.
+
+ /* the last time point we have already synced up to (in the remote/master's oplog). */
+ OpTime syncedTo;
+
+ int nClonedThisPass;
+
+ typedef vector< shared_ptr< ReplSource > > SourceVector;
+ static void loadAll(SourceVector&);
+ explicit ReplSource(BSONObj);
+
+ /* -1 = error */
+ int sync(int& nApplied);
+
+ void save(); // write ourself to local.sources
+
+ // make a jsobj from our member fields of the form
+ // { host: ..., source: ..., syncedTo: ... }
+ BSONObj jsobj();
+
+ bool operator==(const ReplSource&r) const {
+ return hostName == r.hostName && sourceName() == r.sourceName();
+ }
+ string toString() const { return sourceName() + "@" + hostName; }
+
+ bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); }
+ int sleepAdvice() const {
+ if ( !_sleepAdviceTime )
+ return 0;
+ int wait = _sleepAdviceTime - unsigned( time( 0 ) );
+ return wait > 0 ? wait : 0;
+ }
+
+ static bool throttledForceResyncDead( const char *requester );
+ static void forceResyncDead( const char *requester );
+ void forceResync( const char *requester );
+ };
+
+ bool anyReplEnabled();
+ void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level = 0 );
+
+ /**
+ * Helper class used to set and query an ignore state for a named database.
+ * The ignore state will expire after a specified OpTime.
+ */
+ class DatabaseIgnorer {
+ public:
+ /** Indicate that operations for 'db' should be ignored until after 'futureOplogTime' */
+ void doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime );
+ /**
+ * Query ignore state of 'db'; if 'currentOplogTime' is after the ignore
+ * limit, the ignore state will be cleared.
+ */
+ bool ignoreAt( const string &db, const OpTime &currentOplogTime );
+ private:
+ map< string, OpTime > _ignores;
+ };
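+
+ /* usage sketch (illustrative only; the db name and 'futureOplogTime' are hypothetical):
+ DatabaseIgnorer ignorer;
+ ignorer.doIgnoreUntilAfter( "accounting", futureOplogTime );
+ if ( ignorer.ignoreAt( "accounting", currentOplogTime ) ) {
+ // skip ops for "accounting"; the state clears itself once
+ // currentOplogTime passes futureOplogTime
+ }
+ */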
+
+} // namespace mongo
diff --git a/src/mongo/db/repl/connections.h b/src/mongo/db/repl/connections.h
new file mode 100644
index 00000000000..3e08f80b047
--- /dev/null
+++ b/src/mongo/db/repl/connections.h
@@ -0,0 +1,128 @@
+// @file
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <map>
+#include "../../client/dbclient.h"
+#include "../security_common.h"
+
+namespace mongo {
+
+ /** here we keep a single connection (with reconnect) for a set of hosts,
+ one each, and allow one user at a time per host. if in use already for that
+ host, we block. so this is an easy way to keep a 1-deep pool of connections
+ that many threads can share.
+
+ thread-safe.
+
+ Example:
+ {
+ ScopedConn c("foo.acme.com:9999");
+ c->runCommand(...);
+ }
+
+ throws exception on connect error (but fine to try again later with a new
+ scopedconn object for same host).
+ */
+ class ScopedConn {
+ public:
+ /** throws assertions if connect failure etc. */
+ ScopedConn(string hostport);
+ ~ScopedConn() {
+ // conLock releases...
+ }
+ void reconnect() {
+ conn()->port().shutdown();
+ connect();
+ }
+
+ /* If we were to run a query and not exhaust the cursor, future use of the connection would be problematic.
+ So here what we do is wrap known safe methods and not allow cursor-style queries at all. This makes
+ ScopedConn limited in functionality but very safe. More non-cursor wrappers can be added here if needed.
+ */
+ bool runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0) {
+ return conn()->runCommand(dbname, cmd, info, options);
+ }
+ unsigned long long count(const string &ns) {
+ return conn()->count(ns);
+ }
+ BSONObj findOne(const string &ns, const Query& q, const BSONObj *fieldsToReturn = 0, int queryOptions = 0) {
+ return conn()->findOne(ns, q, fieldsToReturn, queryOptions);
+ }
+
+ private:
+ auto_ptr<scoped_lock> connLock;
+ static mongo::mutex mapMutex;
+ struct X {
+ mongo::mutex z;
+ DBClientConnection cc;
+ bool connected;
+ X() : z("X"), cc(/*reconnect*/ true, 0, /*timeout*/ 10.0), connected(false) {
+ cc._logLevel = 2;
+ }
+ } *x;
+ typedef map<string,ScopedConn::X*> M;
+ static M& _map;
+ DBClientConnection* conn() { return &x->cc; }
+ const string _hostport;
+
+ // we should already be locked...
+ bool connect() {
+ string err;
+ if (!x->cc.connect(_hostport, err)) {
+ log() << "couldn't connect to " << _hostport << ": " << err << rsLog;
+ return false;
+ }
+ x->connected = true;
+
+ // if we cannot authenticate against a member, then either its key file
+ // or our key file has to change. if our key file has to change, we'll
+ // be rebooting. if their file has to change, they'll be rebooted so the
+ // connection created above will go dead, reconnect, and reauth.
+ if (!noauth && !x->cc.auth("local", internalSecurity.user, internalSecurity.pwd, err, false)) {
+ log() << "could not authenticate against " << _hostport << ", " << err << rsLog;
+ return false;
+ }
+
+ return true;
+ }
+ };
+
+ inline ScopedConn::ScopedConn(string hostport) : _hostport(hostport) {
+ bool first = false;
+ {
+ scoped_lock lk(mapMutex);
+ x = _map[_hostport];
+ if( x == 0 ) {
+ x = _map[_hostport] = new X();
+ first = true;
+ connLock.reset( new scoped_lock(x->z) );
+ }
+ }
+
+ // Keep trying to connect if we're not yet connected; even on the retry path we
+ // must hold the per-host lock before touching the shared connection object.
+ if( !first ) {
+ connLock.reset( new scoped_lock(x->z) );
+ if( x->connected )
+ return;
+ }
+
+ connect();
+ }
+
+}
diff --git a/src/mongo/db/repl/consensus.cpp b/src/mongo/db/repl/consensus.cpp
new file mode 100644
index 00000000000..3995373f5ef
--- /dev/null
+++ b/src/mongo/db/repl/consensus.cpp
@@ -0,0 +1,449 @@
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "rs.h"
+#include "multicmd.h"
+
+namespace mongo {
+
+ class CmdReplSetFresh : public ReplSetCommand {
+ public:
+ CmdReplSetFresh() : ReplSetCommand("replSetFresh") { }
+ private:
+
+ bool shouldVeto(const BSONObj& cmdObj, string& errmsg) {
+ unsigned id = cmdObj["id"].Int();
+ const Member* primary = theReplSet->box.getPrimary();
+ const Member* hopeful = theReplSet->findById(id);
+ const Member *highestPriority = theReplSet->getMostElectable();
+
+ if( !hopeful ) {
+ errmsg = str::stream() << "replSet couldn't find member with id " << id;
+ return true;
+ }
+ else if( theReplSet->isPrimary() && theReplSet->lastOpTimeWritten >= hopeful->hbinfo().opTime ) {
+ // hbinfo is not updated, so we have to check the primary's last optime separately
+ errmsg = str::stream() << "I am already primary, " << hopeful->fullName() <<
+ " can try again once I've stepped down";
+ return true;
+ }
+ else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) {
+ // other members might be aware of more up-to-date nodes
+ errmsg = str::stream() << hopeful->fullName() << " is trying to elect itself but " <<
+ primary->fullName() << " is already primary and more up-to-date";
+ return true;
+ }
+ else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) {
+ errmsg = str::stream() << hopeful->fullName() << " has lower priority than " << highestPriority->fullName();
+ return true;
+ }
+
+ // don't veto older versions
+ if (cmdObj["id"].eoo()) {
+ // they won't be looking for the veto field
+ return false;
+ }
+
+ if ( !theReplSet->isElectable(id) ||
+ (highestPriority && highestPriority->config().priority > hopeful->config().priority)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+
+ if( cmdObj["set"].String() != theReplSet->name() ) {
+ errmsg = "wrong repl set name";
+ return false;
+ }
+ string who = cmdObj["who"].String();
+ int cfgver = cmdObj["cfgver"].Int();
+ OpTime opTime(cmdObj["opTime"].Date());
+
+ bool weAreFresher = false;
+ if( theReplSet->config().version > cfgver ) {
+ log() << "replSet member " << who << " is not yet aware its cfg version " << cfgver << " is stale" << rsLog;
+ result.append("info", "config version stale");
+ weAreFresher = true;
+ }
+ // check not only our own optime, but any other member we can reach
+ else if( opTime < theReplSet->lastOpTimeWritten ||
+ opTime < theReplSet->lastOtherOpTime()) {
+ weAreFresher = true;
+ }
+ result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate());
+ result.append("fresher", weAreFresher);
+ result.append("veto", shouldVeto(cmdObj, errmsg));
+
+ return true;
+ }
+ } cmdReplSetFresh;
+
+ class CmdReplSetElect : public ReplSetCommand {
+ public:
+ CmdReplSetElect() : ReplSetCommand("replSetElect") { }
+ private:
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ theReplSet->elect.electCmdReceived(cmdObj, &result);
+ return true;
+ }
+ } cmdReplSetElect;
+
+ int Consensus::totalVotes() const {
+ static int complain = 0;
+ int vTot = rs._self->config().votes;
+ for( Member *m = rs.head(); m; m=m->next() )
+ vTot += m->config().votes;
+ if( vTot % 2 == 0 && vTot && complain++ == 0 )
+ log() << "replSet " /*buildbot! warning */ "total number of votes is even - add arbiter or give one member an extra vote" << rsLog;
+ return vTot;
+ }
+
+ bool Consensus::aMajoritySeemsToBeUp() const {
+ int vUp = rs._self->config().votes;
+ for( Member *m = rs.head(); m; m=m->next() )
+ vUp += m->hbinfo().up() ? m->config().votes : 0;
+ return vUp * 2 > totalVotes();
+ }
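+
+ /* e.g. with three members of one vote each, totalVotes() == 3; if two are up,
+ vUp == 2 and 2*2 > 3, so a majority seems up. if only one is up, 1*2 > 3 fails. */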
+
+ bool Consensus::shouldRelinquish() const {
+ int vUp = rs._self->config().votes;
+ const long long T = rs.config().ho.heartbeatTimeoutMillis * rs.config().ho.heartbeatConnRetries;
+ for( Member *m = rs.head(); m; m=m->next() ) {
+ long long dt = m->hbinfo().timeDown();
+ if( dt < T )
+ vUp += m->config().votes;
+ }
+
+ // the manager will handle calling stepdown if another node should be
+ // primary due to priority
+
+ return !( vUp * 2 > totalVotes() );
+ }
+
+ static const int VETO = -10000;
+
+ const time_t LeaseTime = 30;
+
+ SimpleMutex Consensus::lyMutex("ly");
+
+ unsigned Consensus::yea(unsigned memberId) { /* throws VoteException */
+ SimpleMutex::scoped_lock lk(lyMutex);
+ LastYea &L = this->ly.ref(lk);
+ time_t now = time(0);
+ if( L.when + LeaseTime >= now && L.who != memberId ) {
+ LOG(1) << "replSet not voting yea for " << memberId <<
+ " voted for " << L.who << ' ' << now-L.when << " secs ago" << rsLog;
+ throw VoteException();
+ }
+ L.when = now;
+ L.who = memberId;
+ return rs._self->config().votes;
+ }
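+
+ /* e.g. if we granted a yea to member 3 ten seconds ago, a request from member 5
+ throws VoteException until the 30 second LeaseTime lapses; a repeat request from
+ member 3 itself simply renews the lease. */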
+
+ /* we vote for ourself at start of election. once it fails, we can cancel the lease we had in
+ place instead of leaving it for a long time.
+ */
+ void Consensus::electionFailed(unsigned meid) {
+ SimpleMutex::scoped_lock lk(lyMutex);
+ LastYea &L = ly.ref(lk);
+ DEV assert( L.who == meid ); // this may not always hold, so be aware, but adding for now as a quick sanity test
+ if( L.who == meid )
+ L.when = 0;
+ }
+
+ /* todo: threading **************** !!!!!!!!!!!!!!!! */
+ void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) {
+ BSONObjBuilder& b = *_b;
+ DEV log() << "replSet received elect msg " << cmd.toString() << rsLog;
+ else LOG(2) << "replSet received elect msg " << cmd.toString() << rsLog;
+ string set = cmd["set"].String();
+ unsigned whoid = cmd["whoid"].Int();
+ int cfgver = cmd["cfgver"].Int();
+ OID round = cmd["round"].OID();
+ int myver = rs.config().version;
+
+ const Member* primary = rs.box.getPrimary();
+ const Member* hopeful = rs.findById(whoid);
+ const Member* highestPriority = rs.getMostElectable();
+
+ int vote = 0;
+ if( set != rs.name() ) {
+ log() << "replSet error received an elect request for '" << set << "' but our set name is '" << rs.name() << "'" << rsLog;
+ }
+ else if( myver < cfgver ) {
+ // we are stale. don't vote
+ }
+ else if( myver > cfgver ) {
+ // they are stale!
+ log() << "replSet electCmdReceived info got stale version # during election" << rsLog;
+ vote = -10000;
+ }
+ else if( !hopeful ) {
+ log() << "replSet electCmdReceived couldn't find member with id " << whoid << rsLog;
+ vote = -10000;
+ }
+ else if( primary && primary == rs._self && rs.lastOpTimeWritten >= hopeful->hbinfo().opTime ) {
+ // hbinfo is not updated, so we have to check the primary's last optime separately
+ log() << "I am already primary, " << hopeful->fullName()
+ << " can try again once I've stepped down" << rsLog;
+ vote = -10000;
+ }
+ else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) {
+ // other members might be aware of more up-to-date nodes
+ log() << hopeful->fullName() << " is trying to elect itself but " <<
+ primary->fullName() << " is already primary and more up-to-date" << rsLog;
+ vote = -10000;
+ }
+ else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) {
+ log() << hopeful->fullName() << " has lower priority than " << highestPriority->fullName();
+ vote = -10000;
+ }
+ else {
+ try {
+ vote = yea(whoid);
+ dassert( hopeful->id() == whoid );
+ rs.relinquish();
+ log() << "replSet info voting yea for " << hopeful->fullName() << " (" << whoid << ')' << rsLog;
+ }
+ catch(VoteException&) {
+ log() << "replSet voting no for " << hopeful->fullName() << " already voted for another" << rsLog;
+ }
+ }
+
+ b.append("vote", vote);
+ b.append("round", round);
+ }
+
+ void ReplSetImpl::_getTargets(list<Target>& L, int& configVersion) {
+ configVersion = config().version;
+ for( Member *m = head(); m; m=m->next() )
+ if( m->hbinfo().maybeUp() )
+ L.push_back( Target(m->fullName()) );
+ }
+
+ /* config version is returned as it is ok to use this unlocked. BUT, if unlocked, you would need
+ to check later that the config didn't change. */
+ void ReplSetImpl::getTargets(list<Target>& L, int& configVersion) {
+ if( lockedByMe() ) {
+ _getTargets(L, configVersion);
+ return;
+ }
+ lock lk(this);
+ _getTargets(L, configVersion);
+ }
+
+ /* Do we have the newest data of them all?
+ @param allUp - set to true if all members are up. Only set if true returned.
+ @return true if we are freshest. Note we may tie.
+ */
+ bool Consensus::weAreFreshest(bool& allUp, int& nTies) {
+ const OpTime ord = theReplSet->lastOpTimeWritten;
+ nTies = 0;
+ assert( !ord.isNull() );
+ BSONObj cmd = BSON(
+ "replSetFresh" << 1 <<
+ "set" << rs.name() <<
+ "opTime" << Date_t(ord.asDate()) <<
+ "who" << rs._self->fullName() <<
+ "cfgver" << rs._cfg->version <<
+ "id" << rs._self->id());
+ list<Target> L;
+ int ver;
+ /* the following queries arbiters, even though they are never fresh. wonder if that makes sense.
+ it doesn't, but it could, if they "knew" what freshness is one day. so consider removing
+ arbiters from getTargets() here. although getTargets is used elsewhere for elections; there
+ arbiters are certainly targets - so an "includeArbs" bool would be necessary if we wanted
+ to skip fetching them here.
+ */
+ rs.getTargets(L, ver);
+ multiCommand(cmd, L);
+ int nok = 0;
+ allUp = true;
+ for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
+ if( i->ok ) {
+ nok++;
+ if( i->result["fresher"].trueValue() ) {
+ log() << "not electing self, we are not freshest" << rsLog;
+ return false;
+ }
+ OpTime remoteOrd( i->result["opTime"].Date() );
+ if( remoteOrd == ord )
+ nTies++;
+ assert( remoteOrd <= ord );
+
+ if( i->result["veto"].trueValue() ) {
+ BSONElement msg = i->result["errmsg"];
+ if (!msg.eoo()) {
+ log() << "not electing self, " << i->toHost << " would veto with '" <<
+ msg.String() << "'" << rsLog;
+ }
+ else {
+ log() << "not electing self, " << i->toHost << " would veto" << rsLog;
+ }
+ return false;
+ }
+ }
+ else {
+ DEV log() << "replSet freshest returns " << i->result.toString() << rsLog;
+ allUp = false;
+ }
+ }
+ LOG(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog;
+ assert( ord <= theReplSet->lastOpTimeWritten ); // <= as this may change while we are working...
+ return true;
+ }
+
+ extern time_t started;
+
+ void Consensus::multiCommand(BSONObj cmd, list<Target>& L) {
+ assert( !rs.lockedByMe() );
+ mongo::multiCommand(cmd, L);
+ }
+
+ void Consensus::_electSelf() {
+ if( time(0) < steppedDown )
+ return;
+
+ {
+ const OpTime ord = theReplSet->lastOpTimeWritten;
+ if( ord == 0 ) {
+ log() << "replSet info not trying to elect self, do not yet have a complete set of data from any point in time" << rsLog;
+ return;
+ }
+ }
+
+ bool allUp;
+ int nTies;
+ if( !weAreFreshest(allUp, nTies) ) {
+ return;
+ }
+
+ rs.sethbmsg("",9);
+
+ if( !allUp && time(0) - started < 60 * 5 ) {
+ /* the idea here is that if a bunch of nodes bounce all at once, we don't want to drop data
+ if we don't have to -- we'd rather be offline and wait a little longer instead
+ todo: make this configurable.
+ */
+ rs.sethbmsg("not electing self, not all members up and we have been up less than 5 minutes");
+ return;
+ }
+
+ Member& me = *rs._self;
+
+ if( nTies ) {
+ /* tie? we then randomly sleep to try to not collide on our voting. */
+ /* todo: smarter. */
+ if( me.id() == 0 || sleptLast ) {
+ // would be fine for one node not to sleep
+ // todo: biggest / highest priority nodes should be the ones that get to not sleep
+ }
+ else {
+ assert( !rs.lockedByMe() ); // bad to go to sleep locked
+ unsigned ms = ((unsigned) rand()) % 1000 + 50;
+ DEV log() << "replSet tie " << nTies << " sleeping a little " << ms << "ms" << rsLog;
+ sleptLast = true;
+ sleepmillis(ms);
+ throw RetryAfterSleepException();
+ }
+ }
+ sleptLast = false;
+
+ time_t start = time(0);
+ unsigned meid = me.id();
+ int tally = yea( meid );
+ bool success = false;
+ try {
+ log() << "replSet info electSelf " << meid << rsLog;
+
+ BSONObj electCmd = BSON(
+ "replSetElect" << 1 <<
+ "set" << rs.name() <<
+ "who" << me.fullName() <<
+ "whoid" << me.hbinfo().id() <<
+ "cfgver" << rs._cfg->version <<
+ "round" << OID::gen() /* this is just for diagnostics */
+ );
+
+ int configVersion;
+ list<Target> L;
+ rs.getTargets(L, configVersion);
+ multiCommand(electCmd, L);
+
+ {
+ for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
+ DEV log() << "replSet elect res: " << i->result.toString() << rsLog;
+ if( i->ok ) {
+ int v = i->result["vote"].Int();
+ tally += v;
+ }
+ }
+ if( tally*2 <= totalVotes() ) {
+ log() << "replSet couldn't elect self, only received " << tally << " votes" << rsLog;
+ }
+ else if( time(0) - start > 30 ) {
+ // defensive; should never happen as we have timeouts on connection and operation for our conn
+ log() << "replSet too much time passed during our election, ignoring result" << rsLog;
+ }
+ else if( configVersion != rs.config().version ) {
+ log() << "replSet config version changed during our election, ignoring result" << rsLog;
+ }
+ else {
+ /* succeeded. */
+ log(1) << "replSet election succeeded, assuming primary role" << rsLog;
+ success = true;
+ rs.assumePrimary();
+ }
+ }
+ }
+ catch( std::exception& ) {
+ if( !success ) electionFailed(meid);
+ throw;
+ }
+ if( !success ) electionFailed(meid);
+ }
+
+ void Consensus::electSelf() {
+ assert( !rs.lockedByMe() );
+ assert( !rs.myConfig().arbiterOnly );
+ assert( rs.myConfig().slaveDelay == 0 );
+ try {
+ _electSelf();
+ }
+ catch(RetryAfterSleepException&) {
+ throw;
+ }
+ catch(VoteException& ) {
+ log() << "replSet not trying to elect self as responded yea to someone else recently" << rsLog;
+ }
+ catch(DBException& e) {
+ log() << "replSet warning caught unexpected exception in electSelf() " << e.toString() << rsLog;
+ }
+ catch(...) {
+ log() << "replSet warning caught unexpected exception in electSelf()" << rsLog;
+ }
+ }
+
+}
diff --git a/src/mongo/db/repl/health.cpp b/src/mongo/db/repl/health.cpp
new file mode 100644
index 00000000000..0b7ed87eac3
--- /dev/null
+++ b/src/mongo/db/repl/health.cpp
@@ -0,0 +1,449 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "health.h"
+#include "../../util/background.h"
+#include "../../client/dbclient.h"
+#include "../../client/connpool.h"
+#include "../commands.h"
+#include "../../util/concurrency/value.h"
+#include "../../util/concurrency/task.h"
+#include "../../util/mongoutils/html.h"
+#include "../../util/goodies.h"
+#include "../../util/ramlog.h"
+#include "../helpers/dblogger.h"
+#include "connections.h"
+#include "../../util/unittest.h"
+#include "../dbhelpers.h"
+
+namespace mongo {
+ /* decls for connections.h */
+ ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M());
+ mutex ScopedConn::mapMutex("ScopedConn::mapMutex");
+}
+
+namespace mongo {
+
+ using namespace mongoutils::html;
+ using namespace bson;
+
+ static RamLog * _rsLog = new RamLog( "rs" );
+ Tee *rsLog = _rsLog;
+ extern bool replSetBlind; // for testing
+
+ string ago(time_t t) {
+ if( t == 0 ) return "";
+
+ time_t x = time(0) - t;
+ stringstream s;
+ if( x < 180 ) {
+ s << x << " sec";
+ if( x != 1 ) s << 's';
+ }
+ else if( x < 3600 ) {
+ s.precision(2);
+ s << x / 60.0 << " mins";
+ }
+ else {
+ s.precision(2);
+ s << x / 3600.0 << " hrs";
+ }
+ return s.str();
+ }
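+
+ /* illustrative outputs: ago(now-90) -> "90 secs", ago(now-600) -> "10 mins",
+ ago(now-7200) -> "2 hrs" (minutes and hours printed with two significant digits). */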
+
+ void Member::summarizeMember(stringstream& s) const {
+ s << tr();
+ {
+ stringstream u;
+ u << "http://" << h().host() << ':' << (h().port() + 1000) << "/_replSet";
+ s << td( a(u.str(), "", fullName()) );
+ }
+ s << td( id() );
+ double h = hbinfo().health;
+ bool ok = h > 0;
+ s << td(red(str::stream() << h,h == 0));
+ s << td(ago(hbinfo().upSince));
+ bool never = false;
+ {
+ string h;
+ time_t hb = hbinfo().lastHeartbeat;
+ if( hb == 0 ) {
+ h = "never";
+ never = true;
+ }
+ else h = ago(hb) + " ago";
+ s << td(h);
+ }
+ s << td(config().votes);
+ s << td(config().priority);
+ {
+ string stateText = state().toString();
+ if( _config.hidden )
+ stateText += " (hidden)";
+ if( ok || stateText.empty() )
+ s << td(stateText); // text blank if we've never connected
+ else
+ s << td( grey(str::stream() << "(was " << state().toString() << ')', true) );
+ }
+ s << td( grey(hbinfo().lastHeartbeatMsg,!ok) );
+ stringstream q;
+ q << "/_replSetOplog?_id=" << id();
+ s << td( a(q.str(), "", never ? "?" : hbinfo().opTime.toString()) );
+ if( hbinfo().skew > INT_MIN ) {
+ s << td( grey(str::stream() << hbinfo().skew,!ok) );
+ }
+ else
+ s << td("");
+ s << _tr();
+ }
+
+ string ReplSetImpl::stateAsHtml(MemberState s) {
+ if( s.s == MemberState::RS_STARTUP ) return a("", "server still starting up, or still trying to initiate the set", "STARTUP");
+ if( s.s == MemberState::RS_PRIMARY ) return a("", "this server thinks it is primary", "PRIMARY");
+ if( s.s == MemberState::RS_SECONDARY ) return a("", "this server thinks it is a secondary (slave mode)", "SECONDARY");
+ if( s.s == MemberState::RS_RECOVERING ) return a("", "recovering/resyncing; after recovery usually auto-transitions to secondary", "RECOVERING");
+ if( s.s == MemberState::RS_FATAL ) return a("", "something bad has occurred and the server is not completely offline with regard to the replica set. fatal error.", "FATAL");
+ if( s.s == MemberState::RS_STARTUP2 ) return a("", "loaded config, still determining who is primary", "STARTUP2");
+ if( s.s == MemberState::RS_ARBITER ) return a("", "this server is an arbiter only", "ARBITER");
+ if( s.s == MemberState::RS_DOWN ) return a("", "member is down, slow, or unreachable", "DOWN");
+ if( s.s == MemberState::RS_ROLLBACK ) return a("", "rolling back operations to get in sync", "ROLLBACK");
+ return "";
+ }
+
+ extern time_t started;
+
+ // oplogdiags in web ui
+ static void say(stringstream&ss, const bo& op) {
+ ss << "<tr>";
+
+ set<string> skip;
+ be e = op["ts"];
+ if( e.type() == Date || e.type() == Timestamp ) {
+ OpTime ot = e._opTime();
+ ss << td( time_t_to_String_short( ot.getSecs() ) );
+ ss << td( ot.toString() );
+ skip.insert("ts");
+ }
+ else ss << td("?") << td("?");
+
+ e = op["h"];
+ if( e.type() == NumberLong ) {
+ ss << "<td>" << hex << e.Long() << "</td>\n";
+ skip.insert("h");
+ }
+ else
+ ss << td("?");
+
+ ss << td(op["op"].valuestrsafe());
+ ss << td(op["ns"].valuestrsafe());
+ skip.insert("op");
+ skip.insert("ns");
+
+ ss << "<td>";
+ for( bo::iterator i(op); i.more(); ) {
+ be e = i.next();
+ if( skip.count(e.fieldName()) ) continue;
+ ss << e.toString() << ' ';
+ }
+ ss << "</td></tr>\n";
+ }
+
+ void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const {
+ const Member *m = findById(server_id);
+ if( m == 0 ) {
+ ss << "Error : can't find a member with id: " << server_id << '\n';
+ return;
+ }
+
+ ss << p("Server : " + m->fullName() + "<br>ns : " + rsoplog );
+
+ //const bo fields = BSON( "o" << false << "o2" << false );
+ const bo fields;
+
+ /** todo: fix - we might want a socket (SO) timeout here */
+ DBClientConnection conn(false, 0, /*timeout*/ 20);
+ {
+ string errmsg;
+ if( !conn.connect(m->fullName(), errmsg) ) {
+ ss << "couldn't connect to " << m->fullName() << ' ' << errmsg;
+ return;
+ }
+ }
+
+ auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",1), 20, 0, &fields);
+ if( c.get() == 0 ) {
+ ss << "couldn't query " << rsoplog;
+ return;
+ }
+ static const char *h[] = {"ts","optime", "h","op","ns","rest",0};
+
+ ss << "<style type=\"text/css\" media=\"screen\">"
+ "table { font-size:75% }\n"
+ // "th { background-color:#bbb; color:#000 }\n"
+ // "td,th { padding:.25em }\n"
+ "</style>\n";
+
+ ss << table(h, true);
+ //ss << "<pre>\n";
+ int n = 0;
+ OpTime otFirst;
+ OpTime otLast;
+ OpTime otEnd;
+ while( c->more() ) {
+ bo o = c->next();
+ otLast = o["ts"]._opTime();
+ if( otFirst.isNull() )
+ otFirst = otLast;
+ say(ss, o);
+ n++;
+ }
+ if( n == 0 ) {
+ ss << rsoplog << " is empty\n";
+ }
+ else {
+ auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields);
+ if( c.get() == 0 ) {
+ ss << "couldn't query [2] " << rsoplog;
+ return;
+ }
+ string x;
+ bo o = c->next();
+ otEnd = o["ts"]._opTime();
+ while( 1 ) {
+ stringstream z;
+ if( o["ts"]._opTime() == otLast )
+ break;
+ say(z, o);
+ x = z.str() + x;
+ if( !c->more() )
+ break;
+ o = c->next();
+ }
+ if( !x.empty() ) {
+ ss << "<tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr>\n" << x;
+ //ss << "\n...\n\n" << x;
+ }
+ }
+ ss << _table();
+ ss << p(time_t_to_String_short(time(0)) + " current time");
+
+ if( !otEnd.isNull() ) {
+ ss << "<p>Log length in time: ";
+ unsigned d = otEnd.getSecs() - otFirst.getSecs();
+ double h = d / 3600.0;
+ ss.precision(3);
+ if( h < 72 )
+ ss << h << " hours";
+ else
+ ss << h / 24.0 << " days";
+ ss << "</p>\n";
+ }
+ }
+
+ void ReplSetImpl::_summarizeAsHtml(stringstream& s) const {
+ s << table(0, false);
+ s << tr("Set name:", _name);
+ s << tr("Majority up:", elect.aMajoritySeemsToBeUp()?"yes":"no" );
+ s << _table();
+
+ const char *h[] = {"Member",
+ "<a title=\"member id in the replset config\">id</a>",
+ "Up",
+ "<a title=\"length of time we have been continuously connected to the other member with no reconnects (for self, shows uptime)\">cctime</a>",
+ "<a title=\"when this server last received a heartbeat response - includes error code responses\">Last heartbeat</a>",
+ "Votes", "Priority", "State", "Messages",
+ "<a title=\"how up to date this server is. this value is polled every few seconds, so the actual lag is typically much lower than shown here.\">optime</a>",
+ "<a title=\"Clock skew in seconds relative to this server. Informational; server clock variances will make the diagnostics hard to read, but otherwise are benign.\">skew</a>",
+ 0
+ };
+ s << table(h);
+
+ /* this is to sort the member rows by their ordinal _id, so they show up in the same
+ order on all the different web ui's; that is less confusing for the operator. */
+ map<int,string> mp;
+
+ string myMinValid;
+ try {
+ readlocktry lk("local.replset.minvalid", 300);
+ if( lk.got() ) {
+ BSONObj mv;
+ if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
+ myMinValid = "minvalid:" + mv["ts"]._opTime().toString();
+ }
+ }
+ else myMinValid = ".";
+ }
+ catch(...) {
+ myMinValid = "exception fetching minvalid";
+ }
+
+ const Member *_self = this->_self;
+ assert(_self);
+ {
+ stringstream s;
+ /* self row */
+ s << tr() << td(_self->fullName() + " (me)") <<
+ td(_self->id()) <<
+ td("1") << //up
+ td(ago(started)) <<
+ td("") << // last heartbeat
+ td(ToString(_self->config().votes)) <<
+ td(ToString(_self->config().priority)) <<
+ td( stateAsHtml(box.getState()) + (_self->config().hidden?" (hidden)":"") );
+ s << td( _hbmsg );
+ stringstream q;
+ q << "/_replSetOplog?_id=" << _self->id();
+ s << td( a(q.str(), myMinValid, theReplSet->lastOpTimeWritten.toString()) );
+ s << td(""); // skew
+ s << _tr();
+ mp[_self->hbinfo().id()] = s.str();
+ }
+ Member *m = head();
+ while( m ) {
+ stringstream s;
+ m->summarizeMember(s);
+ mp[m->hbinfo().id()] = s.str();
+ m = m->next();
+ }
+
+ for( map<int,string>::const_iterator i = mp.begin(); i != mp.end(); i++ )
+ s << i->second;
+ s << _table();
+ }
+
+
+ void fillRsLog(stringstream& s) {
+ _rsLog->toHTML( s );
+ }
+
+ const Member* ReplSetImpl::findById(unsigned id) const {
+ if( _self && id == _self->id() ) return _self;
+
+ for( Member *m = head(); m; m = m->next() )
+ if( m->id() == id )
+ return m;
+ return 0;
+ }
+
+ const OpTime ReplSetImpl::lastOtherOpTime() const {
+ OpTime closest(0,0);
+
+ for( Member *m = _members.head(); m; m=m->next() ) {
+ if (!m->hbinfo().up()) {
+ continue;
+ }
+
+ if (m->hbinfo().opTime > closest) {
+ closest = m->hbinfo().opTime;
+ }
+ }
+
+ return closest;
+ }
+
+ void ReplSetImpl::_summarizeStatus(BSONObjBuilder& b) const {
+ vector<BSONObj> v;
+
+ const Member *_self = this->_self;
+ assert( _self );
+
+ MemberState myState = box.getState();
+
+ // add self
+ {
+ BSONObjBuilder bb;
+ bb.append("_id", (int) _self->id());
+ bb.append("name", _self->fullName());
+ bb.append("health", 1.0);
+ bb.append("state", (int)myState.s);
+ bb.append("stateStr", myState.toString());
+ bb.append("uptime", (unsigned)(time(0) - cmdLine.started));
+ if (!_self->config().arbiterOnly) {
+ bb.appendTimestamp("optime", lastOpTimeWritten.asDate());
+ bb.appendDate("optimeDate", lastOpTimeWritten.getSecs() * 1000LL);
+ }
+
+ int maintenance = _maintenanceMode;
+ if (maintenance) {
+ bb.append("maintenanceMode", maintenance);
+ }
+
+ if (theReplSet) {
+ string s = theReplSet->hbmsg();
+ if( !s.empty() )
+ bb.append("errmsg", s);
+ }
+ bb.append("self", true);
+ v.push_back(bb.obj());
+ }
+
+ Member *m =_members.head();
+ while( m ) {
+ BSONObjBuilder bb;
+ bb.append("_id", (int) m->id());
+ bb.append("name", m->fullName());
+ double h = m->hbinfo().health;
+ bb.append("health", h);
+ bb.append("state", (int) m->state().s);
+ if( h == 0 ) {
+ // if we can't connect the state info is from the past and could be confusing to show
+ bb.append("stateStr", "(not reachable/healthy)");
+ }
+ else {
+ bb.append("stateStr", m->state().toString());
+ }
+ bb.append("uptime", (unsigned) (m->hbinfo().upSince ? (time(0)-m->hbinfo().upSince) : 0));
+ if (!m->config().arbiterOnly) {
+ bb.appendTimestamp("optime", m->hbinfo().opTime.asDate());
+ bb.appendDate("optimeDate", m->hbinfo().opTime.getSecs() * 1000LL);
+ }
+ bb.appendTimeT("lastHeartbeat", m->hbinfo().lastHeartbeat);
+ bb.append("pingMs", m->hbinfo().ping);
+ string s = m->lhb();
+ if( !s.empty() )
+ bb.append("errmsg", s);
+
+ if (m->hbinfo().authIssue) {
+ bb.append("authenticated", false);
+ }
+
+ v.push_back(bb.obj());
+ m = m->next();
+ }
+ sort(v.begin(), v.end());
+ b.append("set", name());
+ b.appendTimeT("date", time(0));
+ b.append("myState", myState.s);
+ const Member *syncTarget = _currentSyncTarget;
+ if (syncTarget && myState != MemberState::RS_PRIMARY) {
+ b.append("syncingTo", syncTarget->fullName());
+ }
+ b.append("members", v);
+ if( replSetBlind )
+ b.append("blind",true); // to avoid confusion if set...normally never set except for testing.
+ }
+
+ static struct Test : public UnitTest {
+ void run() {
+ HealthOptions a,b;
+ assert( a == b );
+ assert( a.isDefault() );
+ }
+ } test;
+
+}
diff --git a/src/mongo/db/repl/health.h b/src/mongo/db/repl/health.h
new file mode 100644
index 00000000000..55cca93a27e
--- /dev/null
+++ b/src/mongo/db/repl/health.h
@@ -0,0 +1,50 @@
+// replset.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /* throws */
+ bool requestHeartbeat(string setname, string fromHost, string memberFullName, BSONObj& result, int myConfigVersion, int& theirConfigVersion, bool checkEmpty = false);
+
+ struct HealthOptions {
+ HealthOptions() :
+ heartbeatSleepMillis(2000),
+ heartbeatTimeoutMillis( 10000 ),
+ heartbeatConnRetries(2)
+ { }
+
+ bool isDefault() const { return *this == HealthOptions(); }
+
+ // see http://www.mongodb.org/display/DOCS/Replica+Set+Internals
+ unsigned heartbeatSleepMillis;
+ unsigned heartbeatTimeoutMillis;
+        unsigned heartbeatConnRetries;
+
+ void check() {
+ uassert(13112, "bad replset heartbeat option", heartbeatSleepMillis >= 10);
+ uassert(13113, "bad replset heartbeat option", heartbeatTimeoutMillis >= 10);
+ }
+
+ bool operator==(const HealthOptions& r) const {
+ return heartbeatSleepMillis==r.heartbeatSleepMillis && heartbeatTimeoutMillis==r.heartbeatTimeoutMillis && heartbeatConnRetries==r.heartbeatConnRetries;
+ }
+ };
+
+}
diff --git a/src/mongo/db/repl/heartbeat.cpp b/src/mongo/db/repl/heartbeat.cpp
new file mode 100644
index 00000000000..331812af85a
--- /dev/null
+++ b/src/mongo/db/repl/heartbeat.cpp
@@ -0,0 +1,382 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "health.h"
+#include "../../util/background.h"
+#include "../../client/dbclient.h"
+#include "../commands.h"
+#include "../../util/concurrency/value.h"
+#include "../../util/concurrency/task.h"
+#include "../../util/concurrency/msg.h"
+#include "../../util/mongoutils/html.h"
+#include "../../util/goodies.h"
+#include "../../util/ramlog.h"
+#include "../helpers/dblogger.h"
+#include "connections.h"
+#include "../../util/unittest.h"
+#include "../instance.h"
+#include "../repl.h"
+
+namespace mongo {
+
+ using namespace bson;
+
+ extern bool replSetBlind;
+ extern ReplSettings replSettings;
+
+ unsigned int HeartbeatInfo::numPings;
+
+ long long HeartbeatInfo::timeDown() const {
+ if( up() ) return 0;
+ if( downSince == 0 )
+ return 0; // still waiting on first heartbeat
+ return jsTime() - downSince;
+ }
+
+ /* { replSetHeartbeat : <setname> } */
+ class CmdReplSetHeartbeat : public ReplSetCommand {
+ public:
+ CmdReplSetHeartbeat() : ReplSetCommand("replSetHeartbeat") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( replSetBlind ) {
+ if (theReplSet) {
+ errmsg = str::stream() << theReplSet->selfFullName() << " is blind";
+ }
+ return false;
+ }
+
+ /* we don't call ReplSetCommand::check() here because heartbeat
+ checks many things that are pre-initialization. */
+ if( !replSet ) {
+ errmsg = "not running with --replSet";
+ return false;
+ }
+
+ if (!checkAuth(errmsg, result)) {
+ return false;
+ }
+
+ /* we want to keep heartbeat connections open when relinquishing primary. tag them here. */
+ {
+ AbstractMessagingPort *mp = cc().port();
+ if( mp )
+ mp->tag |= 1;
+ }
+
+ if( cmdObj["pv"].Int() != 1 ) {
+ errmsg = "incompatible replset protocol version";
+ return false;
+ }
+ {
+ string s = string(cmdObj.getStringField("replSetHeartbeat"));
+ if( cmdLine.ourSetName() != s ) {
+ errmsg = "repl set names do not match";
+ log() << "replSet set names do not match, our cmdline: " << cmdLine._replSet << rsLog;
+ log() << "replSet s: " << s << rsLog;
+ result.append("mismatch", true);
+ return false;
+ }
+ }
+
+ result.append("rs", true);
+ if( cmdObj["checkEmpty"].trueValue() ) {
+ result.append("hasData", replHasDatabases());
+ }
+ if( theReplSet == 0 ) {
+ string from( cmdObj.getStringField("from") );
+ if( !from.empty() ) {
+ scoped_lock lck( replSettings.discoveredSeeds_mx );
+ replSettings.discoveredSeeds.insert(from);
+ }
+ result.append("hbmsg", "still initializing");
+ return true;
+ }
+
+ if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) {
+ errmsg = "repl set names do not match (2)";
+ result.append("mismatch", true);
+ return false;
+ }
+ result.append("set", theReplSet->name());
+ result.append("state", theReplSet->state().s);
+ result.append("e", theReplSet->iAmElectable());
+ result.append("hbmsg", theReplSet->hbmsg());
+ result.append("time", (long long) time(0));
+ result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate());
+ int v = theReplSet->config().version;
+ result.append("v", v);
+ if( v > cmdObj["v"].Int() )
+ result << "config" << theReplSet->config().asBson();
+
+ return true;
+ }
+ } cmdReplSetHeartbeat;
+
+ bool requestHeartbeat(string setName, string from, string memberFullName, BSONObj& result,
+ int myCfgVersion, int& theirCfgVersion, bool checkEmpty) {
+ if( replSetBlind ) {
+ return false;
+ }
+
+ BSONObj cmd = BSON( "replSetHeartbeat" << setName <<
+ "v" << myCfgVersion <<
+ "pv" << 1 <<
+ "checkEmpty" << checkEmpty <<
+ "from" << from );
+
+ // generally not a great idea to do outbound waiting calls in a
+ // write lock. heartbeats can be slow (multisecond to respond), so
+ // generally we don't want to be locked, at least not without
+        // thinking carefully about it first.
+ uassert(15900, "can't heartbeat: too much lock",
+ !d.dbMutex.isWriteLocked() || theReplSet == 0 || !theReplSet->lockedByMe() );
+
+ ScopedConn conn(memberFullName);
+ return conn.runCommand("admin", cmd, result, 0);
+ }
+
+ /**
+ * Poll every other set member to check its status.
+ *
+ * A detail about local machines and authentication: suppose we have 2
+ * members, A and B, on the same machine using different keyFiles. A is
+ * primary. If we're just starting the set, there are no admin users, so A
+ * and B can access each other because it's local access.
+ *
+ * Then we add a user to A. B cannot sync this user from A, because as soon
+ * as we add a an admin user, A requires auth. However, A can still
+ * heartbeat B, because B *doesn't* have an admin user. So A can reach B
+ * but B cannot reach A.
+ *
+ * Once B is restarted with the correct keyFile, everything should work as
+ * expected.
+ */
+ class ReplSetHealthPollTask : public task::Task {
+ private:
+ HostAndPort h;
+ HeartbeatInfo m;
+ int tries;
+ const int threshold;
+ public:
+ ReplSetHealthPollTask(const HostAndPort& hh, const HeartbeatInfo& mm)
+ : h(hh), m(mm), tries(0), threshold(15) { }
+
+ string name() const { return "rsHealthPoll"; }
+ void doWork() {
+ if ( !theReplSet ) {
+ LOG(2) << "replSet not initialized yet, skipping health poll this round" << rsLog;
+ return;
+ }
+
+ HeartbeatInfo mem = m;
+ HeartbeatInfo old = mem;
+ try {
+ BSONObj info;
+ int theirConfigVersion = -10000;
+
+ bool ok = _requestHeartbeat(mem, info, theirConfigVersion);
+
+ // weight new ping with old pings
+ // on the first ping, just use the ping value
+ if (old.ping != 0) {
+ mem.ping = (unsigned int)((old.ping * .8) + (mem.ping * .2));
+ }
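+                // e.g. a smoothed ping of 100ms and a raw sample of 200ms yield
+                // 0.8*100 + 0.2*200 = 120ms -- an exponentially weighted moving
+                // average that damps one-off spikes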
+
+ if( ok ) {
+ up(info, mem);
+ }
+ else if (!info["errmsg"].eoo() &&
+ info["errmsg"].str() == "need to login") {
+ authIssue(mem);
+ }
+ else {
+ down(mem, info.getStringField("errmsg"));
+ }
+ }
+ catch(DBException& e) {
+ down(mem, e.what());
+ }
+ catch(...) {
+ down(mem, "replSet unexpected exception in ReplSetHealthPollTask");
+ }
+ m = mem;
+
+ theReplSet->mgr->send( boost::bind(&ReplSet::msgUpdateHBInfo, theReplSet, mem) );
+
+ static time_t last = 0;
+ time_t now = time(0);
+ bool changed = mem.changed(old);
+ if( changed ) {
+ if( old.hbstate != mem.hbstate )
+ log() << "replSet member " << h.toString() << " is now in state " << mem.hbstate.toString() << rsLog;
+ }
+ if( changed || now-last>4 ) {
+ last = now;
+ theReplSet->mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+ }
+ }
+
+ private:
+ bool _requestHeartbeat(HeartbeatInfo& mem, BSONObj& info, int& theirConfigVersion) {
+ if (tries++ % threshold == (threshold - 1)) {
+ ScopedConn conn(h.toString());
+ conn.reconnect();
+ }
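+            // with threshold == 15, every 15th attempt rebuilds the cached
+            // connection to this host (a periodic refresh in case the cached
+            // socket has silently gone bad)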
+
+ Timer timer;
+ time_t before = curTimeMicros64() / 1000000;
+
+ bool ok = requestHeartbeat(theReplSet->name(), theReplSet->selfFullName(),
+ h.toString(), info, theReplSet->config().version, theirConfigVersion);
+
+ mem.ping = (unsigned int)timer.millis();
+
+            // we set this on any response - we don't get this far if we
+            // couldn't connect, because an exception is thrown in that case
+ time_t after = mem.lastHeartbeat = before + (mem.ping / 1000);
+
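+            // the remote's clock reading should land inside our local
+            // [before, after] send/receive window; any excursion outside it is
+            // recorded as skew. e.g. before=1000s, after=1002s, remote t=1005s
+            // gives skew = +3s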
+ if ( info["time"].isNumber() ) {
+ long long t = info["time"].numberLong();
+ if( t > after )
+ mem.skew = (int) (t - after);
+ else if( t < before )
+ mem.skew = (int) (t - before); // negative
+ }
+ else {
+ // it won't be there if remote hasn't initialized yet
+ if( info.hasElement("time") )
+ warning() << "heatbeat.time isn't a number: " << info << endl;
+ mem.skew = INT_MIN;
+ }
+
+ {
+ be state = info["state"];
+ if( state.ok() )
+ mem.hbstate = MemberState(state.Int());
+ }
+
+ return ok;
+ }
+
+ void authIssue(HeartbeatInfo& mem) {
+ mem.authIssue = true;
+ mem.hbstate = MemberState::RS_UNKNOWN;
+
+ // set health to 0 so that this doesn't count towards majority
+ mem.health = 0.0;
+ theReplSet->rmFromElectable(mem.id());
+ }
+
+ void down(HeartbeatInfo& mem, string msg) {
+ mem.authIssue = false;
+ mem.health = 0.0;
+ mem.ping = 0;
+ if( mem.upSince || mem.downSince == 0 ) {
+ mem.upSince = 0;
+ mem.downSince = jsTime();
+ mem.hbstate = MemberState::RS_DOWN;
+ log() << "replSet info " << h.toString() << " is down (or slow to respond): " << msg << rsLog;
+ }
+ mem.lastHeartbeatMsg = msg;
+ theReplSet->rmFromElectable(mem.id());
+ }
+
+ void up(const BSONObj& info, HeartbeatInfo& mem) {
+ HeartbeatInfo::numPings++;
+ mem.authIssue = false;
+
+ if( mem.upSince == 0 ) {
+ log() << "replSet member " << h.toString() << " is up" << rsLog;
+ mem.upSince = mem.lastHeartbeat;
+ }
+ mem.health = 1.0;
+ mem.lastHeartbeatMsg = info["hbmsg"].String();
+ if( info.hasElement("opTime") )
+ mem.opTime = info["opTime"].Date();
+
+ // see if this member is in the electable set
+ if( info["e"].eoo() ) {
+ // for backwards compatibility
+ const Member *member = theReplSet->findById(mem.id());
+ if (member && member->config().potentiallyHot()) {
+ theReplSet->addToElectable(mem.id());
+ }
+ else {
+ theReplSet->rmFromElectable(mem.id());
+ }
+ }
+ // add this server to the electable set if it is within 10
+ // seconds of the latest optime we know of
+ else if( info["e"].trueValue() &&
+ mem.opTime >= theReplSet->lastOpTimeWritten.getSecs() - 10) {
+ unsigned lastOp = theReplSet->lastOtherOpTime().getSecs();
+ if (lastOp > 0 && mem.opTime >= lastOp - 10) {
+ theReplSet->addToElectable(mem.id());
+ }
+ }
+ else {
+ theReplSet->rmFromElectable(mem.id());
+ }
+
+ be cfg = info["config"];
+ if( cfg.ok() ) {
+ // received a new config
+ boost::function<void()> f =
+ boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy());
+ theReplSet->mgr->send(f);
+ }
+ }
+ };
+
+ void ReplSetImpl::endOldHealthTasks() {
+ unsigned sz = healthTasks.size();
+ for( set<ReplSetHealthPollTask*>::iterator i = healthTasks.begin(); i != healthTasks.end(); i++ )
+ (*i)->halt();
+ healthTasks.clear();
+ if( sz )
+ DEV log() << "replSet debug: cleared old tasks " << sz << endl;
+ }
+
+ void ReplSetImpl::startHealthTaskFor(Member *m) {
+ ReplSetHealthPollTask *task = new ReplSetHealthPollTask(m->h(), m->hbinfo());
+ healthTasks.insert(task);
+ task::repeat(task, 2000);
+ }
+
+ void startSyncThread();
+
+ /** called during repl set startup. caller expects it to return fairly quickly.
+ note ReplSet object is only created once we get a config - so this won't run
+ until the initiation.
+ */
+ void ReplSetImpl::startThreads() {
+ task::fork(mgr);
+ mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+
+ boost::thread t(startSyncThread);
+
+ task::fork(ghost);
+
+ // member heartbeats are started in ReplSetImpl::initFromConfig
+ }
+
+}
+
+/* todo:
+ stop bg job and delete on removefromset
+*/
diff --git a/src/mongo/db/repl/manager.cpp b/src/mongo/db/repl/manager.cpp
new file mode 100644
index 00000000000..91648a1b506
--- /dev/null
+++ b/src/mongo/db/repl/manager.cpp
@@ -0,0 +1,274 @@
+/* @file manager.cpp
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "connections.h"
+#include "../client.h"
+
+namespace mongo {
+
+ enum {
+ NOPRIMARY = -2,
+ SELFPRIMARY = -1
+ };
+
+ /* check members OTHER THAN US to see if they think they are primary */
+ const Member * Manager::findOtherPrimary(bool& two) {
+ two = false;
+ Member *m = rs->head();
+ Member *p = 0;
+ while( m ) {
+ DEV assert( m != rs->_self );
+ if( m->state().primary() && m->hbinfo().up() ) {
+ if( p ) {
+ two = true;
+ return 0;
+ }
+ p = m;
+ }
+ m = m->next();
+ }
+ if( p )
+ noteARemoteIsPrimary(p);
+ return p;
+ }
+
+ Manager::Manager(ReplSetImpl *_rs) :
+ task::Server("rsMgr"), rs(_rs), busyWithElectSelf(false), _primary(NOPRIMARY) {
+ }
+
+ Manager::~Manager() {
+        /* we don't destroy the replset object we sit in; however, the replset's constructor could have thrown during init.
+ the log message below is just a reminder to come back one day and review this code more, and to
+ make it cleaner.
+ */
+ log() << "info: ~Manager called" << rsLog;
+ rs->mgr = 0;
+ }
+
+ void Manager::starting() {
+ Client::initThread("rsMgr");
+ replLocalAuth();
+ }
+
+ void Manager::noteARemoteIsPrimary(const Member *m) {
+ if( rs->box.getPrimary() == m )
+ return;
+ rs->_self->lhb() = "";
+ if( rs->iAmArbiterOnly() ) {
+ rs->box.set(MemberState::RS_ARBITER, m);
+ }
+ else {
+ rs->box.noteRemoteIsPrimary(m);
+ }
+ }
+
+ void Manager::checkElectableSet() {
+ unsigned otherOp = rs->lastOtherOpTime().getSecs();
+
+ // make sure the electable set is up-to-date
+ if (rs->elect.aMajoritySeemsToBeUp() &&
+ rs->iAmPotentiallyHot() &&
+ (otherOp == 0 || rs->lastOpTimeWritten.getSecs() >= otherOp - 10)) {
+ theReplSet->addToElectable(rs->selfId());
+ }
+ else {
+ theReplSet->rmFromElectable(rs->selfId());
+ }
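+        // i.e. we stay in the electable set only while a majority is visible and
+        // our last applied op is within 10 seconds of the newest op reported by
+        // any other up member (otherOp == 0 means no other member is up)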
+
+ // check if we should ask the primary (possibly ourselves) to step down
+ const Member *highestPriority = theReplSet->getMostElectable();
+ const Member *primary = rs->box.getPrimary();
+
+ if (primary && highestPriority &&
+ highestPriority->config().priority > primary->config().priority) {
+ log() << "stepping down " << primary->fullName() << endl;
+
+ if (primary->h().isSelf()) {
+ // replSetStepDown tries to acquire the same lock
+ // msgCheckNewState takes, so we can't call replSetStepDown on
+ // ourselves.
+ rs->relinquish();
+ }
+ else {
+ BSONObj cmd = BSON( "replSetStepDown" << 1 );
+ ScopedConn conn(primary->fullName());
+ BSONObj result;
+ if (!conn.runCommand("admin", cmd, result, 0)) {
+ log() << "stepping down " << primary->fullName()
+ << " failed: " << result << endl;
+ }
+ }
+ }
+ }
+
+ void Manager::checkAuth() {
+ int down = 0, authIssue = 0, total = 0;
+
+ for( Member *m = rs->head(); m; m=m->next() ) {
+ total++;
+
+ // all authIssue servers will also be not up
+ if (!m->hbinfo().up()) {
+ down++;
+ if (m->hbinfo().authIssue) {
+ authIssue++;
+ }
+ }
+ }
+
+ // if all nodes are down or failed auth AND at least one failed
+ // auth, go into recovering. If all nodes are down, stay a
+ // secondary.
+ if (authIssue > 0 && down == total) {
+ log() << "replset error could not reach/authenticate against any members" << endl;
+
+ if (rs->box.getPrimary() == rs->_self) {
+ log() << "auth problems, relinquishing primary" << rsLog;
+ rs->relinquish();
+ }
+
+ rs->blockSync(true);
+ }
+ else {
+ rs->blockSync(false);
+ }
+ }
+
+ /** called as the health threads get new results */
+ void Manager::msgCheckNewState() {
+ {
+ theReplSet->assertValid();
+ rs->assertValid();
+
+ RSBase::lock lk(rs);
+
+ if( busyWithElectSelf ) return;
+
+ checkElectableSet();
+ checkAuth();
+
+ const Member *p = rs->box.getPrimary();
+ if( p && p != rs->_self ) {
+ if( !p->hbinfo().up() ||
+ !p->hbinfo().hbstate.primary() ) {
+ p = 0;
+ rs->box.setOtherPrimary(0);
+ }
+ }
+
+ const Member *p2;
+ {
+ bool two;
+ p2 = findOtherPrimary(two);
+ if( two ) {
+ /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */
+ log() << "replSet info two primaries (transiently)" << rsLog;
+ return;
+ }
+ }
+
+ if( p2 ) {
+ /* someone else thinks they are primary. */
+ if( p == p2 ) {
+ // we thought the same; all set.
+ return;
+ }
+ if( p == 0 ) {
+ noteARemoteIsPrimary(p2);
+ return;
+ }
+ // todo xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ if( p != rs->_self ) {
+ // switch primary from oldremotep->newremotep2
+ noteARemoteIsPrimary(p2);
+ return;
+ }
+ /* we thought we were primary, yet now someone else thinks they are. */
+ if( !rs->elect.aMajoritySeemsToBeUp() ) {
+ /* we can't see a majority. so the other node is probably the right choice. */
+ noteARemoteIsPrimary(p2);
+ return;
+ }
+ /* ignore for now, keep thinking we are master.
+                       this could just be timing (we poll every couple of seconds), or it could
+                       indicate a problem; if it happens consistently for a duration of time we
+                       should alert the sysadmin.
+ */
+ return;
+ }
+
+ /* didn't find anyone who wants to be primary */
+
+ if( p ) {
+ /* we are already primary */
+
+ if( p != rs->_self ) {
+ rs->sethbmsg("error p != rs->self in checkNewState");
+ log() << "replSet " << p->fullName() << rsLog;
+ log() << "replSet " << rs->_self->fullName() << rsLog;
+ return;
+ }
+
+ if( rs->elect.shouldRelinquish() ) {
+ log() << "can't see a majority of the set, relinquishing primary" << rsLog;
+ rs->relinquish();
+ }
+
+ return;
+ }
+
+ if( !rs->iAmPotentiallyHot() ) { // if not we never try to be primary
+ OCCASIONALLY log() << "replSet I don't see a primary and I can't elect myself" << endl;
+ return;
+ }
+
+ /* no one seems to be primary. shall we try to elect ourself? */
+ if( !rs->elect.aMajoritySeemsToBeUp() ) {
+ static time_t last;
+ static int n;
+ int ll = 0;
+ if( ++n > 5 ) ll++;
+            if( last + 60 > time(0) ) ll++;
+ log(ll) << "replSet can't see a majority, will not try to elect self" << rsLog;
+ last = time(0);
+ return;
+ }
+
+ if( !rs->iAmElectable() ) {
+ return;
+ }
+
+ busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one.
+ }
+ try {
+ rs->elect.electSelf();
+ }
+ catch(RetryAfterSleepException&) {
+        /* we want to process new inbounds before trying this again. so we just put a checkNewState in the queue for eval later. */
+ requeue();
+ }
+ catch(...) {
+ log() << "replSet error unexpected assertion in rs manager" << rsLog;
+ }
+ busyWithElectSelf = false;
+ }
+
+}
diff --git a/src/mongo/db/repl/multicmd.h b/src/mongo/db/repl/multicmd.h
new file mode 100644
index 00000000000..2d70c551f64
--- /dev/null
+++ b/src/mongo/db/repl/multicmd.h
@@ -0,0 +1,75 @@
+// @file multicmd.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../util/background.h"
+#include "connections.h"
+
+namespace mongo {
+
+ struct Target {
+ Target(string hostport) : toHost(hostport), ok(false) { }
+ //Target() : ok(false) { }
+ const string toHost;
+ bool ok;
+ BSONObj result;
+ };
+
+ /** send a command to several servers in parallel. waits for all to complete before
+ returning.
+
+ in: Target::toHost
+ out: Target::result and Target::ok
+ */
+ void multiCommand(BSONObj cmd, list<Target>& L);
+
+ class _MultiCommandJob : public BackgroundJob {
+ public:
+ BSONObj& cmd;
+ Target& d;
+ _MultiCommandJob(BSONObj& _cmd, Target& _d) : cmd(_cmd), d(_d) { }
+
+ private:
+ string name() const { return "MultiCommandJob"; }
+ void run() {
+ try {
+ ScopedConn c(d.toHost);
+ d.ok = c.runCommand("admin", cmd, d.result);
+ }
+ catch(DBException&) {
+ DEV log() << "dev caught dbexception on multiCommand " << d.toHost << rsLog;
+ }
+ }
+ };
+
+ inline void multiCommand(BSONObj cmd, list<Target>& L) {
+ list< shared_ptr<BackgroundJob> > jobs;
+
+ for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
+ Target& d = *i;
+ _MultiCommandJob *j = new _MultiCommandJob(cmd, d);
+ jobs.push_back( shared_ptr<BackgroundJob>(j) );
+ j->go();
+ }
+
+ for( list< shared_ptr<BackgroundJob> >::iterator i = jobs.begin(); i != jobs.end(); i++ ) {
+ (*i)->wait();
+ }
+ }
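+
+    /* usage sketch (hypothetical caller):
+
+           list<Target> targets;
+           targets.push_back(Target("alpha:27017"));
+           targets.push_back(Target("beta:27017"));
+           multiCommand(BSON( "replSetFresh" << 1 ), targets);
+           // each Target's ok/result fields are now filled in
+    */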
+}
diff --git a/src/mongo/db/repl/replset_commands.cpp b/src/mongo/db/repl/replset_commands.cpp
new file mode 100644
index 00000000000..84f16e53466
--- /dev/null
+++ b/src/mongo/db/repl/replset_commands.cpp
@@ -0,0 +1,404 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../commands.h"
+#include "../repl.h"
+#include "health.h"
+#include "rs.h"
+#include "rs_config.h"
+#include "../dbwebserver.h"
+#include "../../util/mongoutils/html.h"
+#include "../../client/dbclient.h"
+#include "../repl_block.h"
+
+using namespace bson;
+
+namespace mongo {
+
+ void checkMembersUpForConfigChange(const ReplSetConfig& cfg, BSONObjBuilder& result, bool initial);
+
+ /* commands in other files:
+ replSetHeartbeat - health.cpp
+ replSetInitiate - rs_mod.cpp
+ */
+
+ bool replSetBlind = false;
+ unsigned replSetForceInitialSyncFailure = 0;
+
+ class CmdReplSetTest : public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "Just for regression tests.\n";
+ }
+ CmdReplSetTest() : ReplSetCommand("replSetTest") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << "replSet replSetTest command received: " << cmdObj.toString() << rsLog;
+
+ if (!checkAuth(errmsg, result)) {
+ return false;
+ }
+
+ if( cmdObj.hasElement("forceInitialSyncFailure") ) {
+ replSetForceInitialSyncFailure = (unsigned) cmdObj["forceInitialSyncFailure"].Number();
+ return true;
+ }
+
+ if( !check(errmsg, result) )
+ return false;
+
+ if( cmdObj.hasElement("blind") ) {
+ replSetBlind = cmdObj.getBoolField("blind");
+ return true;
+ }
+
+ if (cmdObj.hasElement("sethbmsg")) {
+ replset::sethbmsg(cmdObj["sethbmsg"].String());
+ return true;
+ }
+
+ return false;
+ }
+ } cmdReplSetTest;
+
+ /** get rollback id. used to check if a rollback happened during some interval of time.
+        as seen by consumers, the rollback id is not in any particular order; it simply changes on each rollback.
+ @see incRBID()
+ */
+ class CmdReplSetGetRBID : public ReplSetCommand {
+ public:
+ /* todo: ideally this should only change on rollbacks NOT on mongod restarts also. fix... */
+ int rbid;
+ virtual void help( stringstream &help ) const {
+ help << "internal";
+ }
+ CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") {
+            // millis since epoch is ok here, but micros or a combination with rand() and/or 64 bits might be better --
+            // imagine a restart and a clock correction happening simultaneously (very unlikely, but possible...)
+ rbid = (int) curTimeMillis64();
+ }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ result.append("rbid",rbid);
+ return true;
+ }
+ } cmdReplSetRBID;
+
+ /** we increment the rollback id on every rollback event. */
+ void incRBID() {
+ cmdReplSetRBID.rbid++;
+ }
+
+ /** helper to get rollback id from another server. */
+ int getRBID(DBClientConnection *c) {
+ bo info;
+ c->simpleCommand("admin", &info, "replSetGetRBID");
+ return info["rbid"].numberInt();
+ }
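+
+    /* typical use (a sketch): snapshot a remote's rbid with getRBID() before some
+       interval of interest and compare afterwards -- any change means a rollback
+       happened on that server in between. */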
+
+ class CmdReplSetGetStatus : public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "Report status of a replica set from the POV of this server\n";
+ help << "{ replSetGetStatus : 1 }";
+ help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+ CmdReplSetGetStatus() : ReplSetCommand("replSetGetStatus", true) { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( cmdObj["forShell"].trueValue() )
+ lastError.disableForCommand();
+
+ if( !check(errmsg, result) )
+ return false;
+ theReplSet->summarizeStatus(result);
+ return true;
+ }
+ } cmdReplSetGetStatus;
+
+ class CmdReplSetReconfig : public ReplSetCommand {
+ RWLock mutex; /* we don't need rw but we wanted try capability. :-( */
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "Adjust configuration of a replica set\n";
+ help << "{ replSetReconfig : config_object }";
+ help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+ CmdReplSetReconfig() : ReplSetCommand("replSetReconfig"), mutex("rsreconfig") { }
+ virtual bool run(const string& a, BSONObj& b, int e, string& errmsg, BSONObjBuilder& c, bool d) {
+ try {
+ rwlock_try_write lk(mutex);
+ return _run(a,b,e,errmsg,c,d);
+ }
+ catch(rwlock_try_write::exception&) { }
+ errmsg = "a replSetReconfig is already in progress";
+ return false;
+ }
+ private:
+ bool _run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( !checkAuth(errmsg, result) ) {
+ return false;
+ }
+
+ if( cmdObj["replSetReconfig"].type() != Object ) {
+ errmsg = "no configuration specified";
+ return false;
+ }
+
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+ if( force && !theReplSet ) {
+ replSettings.reconfig = cmdObj["replSetReconfig"].Obj().getOwned();
+ result.append("msg", "will try this config momentarily, try running rs.conf() again in a few seconds");
+ return true;
+ }
+
+ if ( !check(errmsg, result) ) {
+ return false;
+ }
+
+ if( !force && !theReplSet->box.getState().primary() ) {
+ errmsg = "replSetReconfig command must be sent to the current replica set primary.";
+ return false;
+ }
+
+ {
+ // just make sure we can get a write lock before doing anything else. we'll reacquire one
+ // later. of course it could be stuck then, but this check lowers the risk if weird things
+ // are up - we probably don't want a change to apply 30 minutes after the initial attempt.
+ time_t t = time(0);
+ writelock lk("");
+ if( time(0)-t > 20 ) {
+ errmsg = "took a long time to get write lock, so not initiating. Initiate when server less busy?";
+ return false;
+ }
+ }
+
+ try {
+ ReplSetConfig newConfig(cmdObj["replSetReconfig"].Obj(), force);
+
+ log() << "replSet replSetReconfig config object parses ok, " << newConfig.members.size() << " members specified" << rsLog;
+
+ if( !ReplSetConfig::legalChange(theReplSet->getConfig(), newConfig, errmsg) ) {
+ return false;
+ }
+
+ checkMembersUpForConfigChange(newConfig, result, false);
+
+ log() << "replSet replSetReconfig [2]" << rsLog;
+
+ theReplSet->haveNewConfig(newConfig, true);
+ ReplSet::startupStatusMsg.set("replSetReconfig'd");
+ }
+ catch( DBException& e ) {
+ log() << "replSet replSetReconfig exception: " << e.what() << rsLog;
+ throw;
+ }
+ catch( string& se ) {
+ log() << "replSet reconfig exception: " << se << rsLog;
+ errmsg = se;
+ return false;
+ }
+
+ resetSlaveCache();
+ return true;
+ }
+ } cmdReplSetReconfig;
+
+ class CmdReplSetFreeze : public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "{ replSetFreeze : <seconds> }";
+ help << "'freeze' state of member to the extent we can do that. What this really means is that\n";
+ help << "this node will not attempt to become primary until the time period specified expires.\n";
+ help << "You can call again with {replSetFreeze:0} to unfreeze sooner.\n";
+ help << "A process restart unfreezes the member also.\n";
+ help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+
+ CmdReplSetFreeze() : ReplSetCommand("replSetFreeze") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ int secs = (int) cmdObj.firstElement().numberInt();
+ if( theReplSet->freeze(secs) ) {
+ if( secs == 0 )
+ result.append("info","unfreezing");
+ }
+ if( secs == 1 )
+ result.append("warning", "you really want to freeze for only 1 second?");
+ return true;
+ }
+ } cmdReplSetFreeze;
+
+ class CmdReplSetStepDown: public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "{ replSetStepDown : <seconds> }\n";
+ help << "Step down as primary. Will not try to reelect self for the specified time period (1 minute if no numeric secs value specified).\n";
+ help << "(If another member with same priority takes over in the meantime, it will stay primary.)\n";
+ help << "http://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+
+ CmdReplSetStepDown() : ReplSetCommand("replSetStepDown") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ if( !theReplSet->box.getState().primary() ) {
+ errmsg = "not primary so can't step down";
+ return false;
+ }
+
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+
+ // only step down if there is another node synced to within 10
+ // seconds of this node
+ if (!force) {
+ long long int lastOp = (long long int)theReplSet->lastOpTimeWritten.getSecs();
+ long long int closest = (long long int)theReplSet->lastOtherOpTime().getSecs();
+
+ long long int diff = lastOp - closest;
+ result.append("closest", closest);
+ result.append("difference", diff);
+
+ if (diff < 0) {
+                // not our problem, but we'll wait until things settle down
+ errmsg = "someone is ahead of the primary?";
+ return false;
+ }
+
+ if (diff > 10) {
+ errmsg = "no secondaries within 10 seconds of my optime";
+ return false;
+ }
+ }
+
+ int secs = (int) cmdObj.firstElement().numberInt();
+ if( secs == 0 )
+ secs = 60;
+ return theReplSet->stepDown(secs);
+ }
+ } cmdReplSetStepDown;
+
+ class CmdReplSetMaintenance: public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "{ replSetMaintenance : bool }\n";
+ help << "Enable or disable maintenance mode.";
+ }
+
+ CmdReplSetMaintenance() : ReplSetCommand("replSetMaintenance") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ if( theReplSet->box.getState().primary() ) {
+ errmsg = "primaries can't modify maintenance mode";
+ return false;
+ }
+
+ theReplSet->setMaintenanceMode(cmdObj["replSetMaintenance"].trueValue());
+ return true;
+ }
+ } cmdReplSetMaintenance;
+
+ using namespace bson;
+ using namespace mongoutils::html;
+ extern void fillRsLog(stringstream&);
+
+ class ReplSetHandler : public DbWebHandler {
+ public:
+ ReplSetHandler() : DbWebHandler( "_replSet" , 1 , true ) {}
+
+ virtual bool handles( const string& url ) const {
+ return startsWith( url , "/_replSet" );
+ }
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+
+ if( url == "/_replSetOplog" ) {
+ responseMsg = _replSetOplog(params);
+ }
+ else
+ responseMsg = _replSet();
+ responseCode = 200;
+ }
+
+ string _replSetOplog(bo parms) {
+ int _id = (int) str::toUnsigned( parms["_id"].String() );
+
+ stringstream s;
+ string t = "Replication oplog";
+ s << start(t);
+ s << p(t);
+
+ if( theReplSet == 0 ) {
+ if( cmdLine._replSet.empty() )
+ s << p("Not using --replSet");
+ else {
+ s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+ + ".<br>" + ReplSet::startupStatusMsg.get());
+ }
+ }
+ else {
+ try {
+ theReplSet->getOplogDiagsAsHtml(_id, s);
+ }
+ catch(std::exception& e) {
+ s << "error querying oplog: " << e.what() << '\n';
+ }
+ }
+
+ s << _end();
+ return s.str();
+ }
+
+ /* /_replSet show replica set status in html format */
+ string _replSet() {
+ stringstream s;
+ s << start("Replica Set Status " + prettyHostName());
+ s << p( a("/", "back", "Home") + " | " +
+ a("/local/system.replset/?html=1", "", "View Replset Config") + " | " +
+ a("/replSetGetStatus?text=1", "", "replSetGetStatus") + " | " +
+ a("http://www.mongodb.org/display/DOCS/Replica+Sets", "", "Docs")
+ );
+
+ if( theReplSet == 0 ) {
+ if( cmdLine._replSet.empty() )
+ s << p("Not using --replSet");
+ else {
+ s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+ + ".<br>" + ReplSet::startupStatusMsg.get());
+ }
+ }
+ else {
+ try {
+ theReplSet->summarizeAsHtml(s);
+ }
+ catch(...) { s << "error summarizing replset status\n"; }
+ }
+ s << p("Recent replset log activity:");
+ fillRsLog(s);
+ s << _end();
+ return s.str();
+ }
+
+
+
+ } replSetHandler;
+
+}
diff --git a/src/mongo/db/repl/rs.cpp b/src/mongo/db/repl/rs.cpp
new file mode 100644
index 00000000000..fff5d72bcc0
--- /dev/null
+++ b/src/mongo/db/repl/rs.cpp
@@ -0,0 +1,778 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../../util/net/sock.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "../dbhelpers.h"
+#include "../../s/d_logic.h"
+#include "rs.h"
+#include "connections.h"
+#include "../repl.h"
+#include "../instance.h"
+
+using namespace std;
+
+namespace mongo {
+
+ using namespace bson;
+
+ bool replSet = false;
+ ReplSet *theReplSet = 0;
+
+ bool isCurrentlyAReplSetPrimary() {
+ return theReplSet && theReplSet->isPrimary();
+ }
+
+ void replset::sethbmsg(const string& s, const int level) {
+ if (theReplSet) {
+            theReplSet->sethbmsg(s, level);
+ }
+ }
+
+ void ReplSetImpl::sethbmsg(string s, int logLevel) {
+ static time_t lastLogged;
+ _hbmsgTime = time(0);
+
+ if( s == _hbmsg ) {
+ // unchanged
+ if( _hbmsgTime - lastLogged < 60 )
+ return;
+ }
+
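+        // _hbmsg is a fixed 256-byte buffer (zeroed in the constructor); longer
+        // messages are truncated to 255 chars, leaving the final byte as the
+        // never-overwritten terminator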
+ unsigned sz = s.size();
+ if( sz >= 256 )
+ memcpy(_hbmsg, s.c_str(), 255);
+ else {
+ _hbmsg[sz] = 0;
+ memcpy(_hbmsg, s.c_str(), sz);
+ }
+ if( !s.empty() ) {
+ lastLogged = _hbmsgTime;
+ log(logLevel) << "replSet " << s << rsLog;
+ }
+ }
+
+ void ReplSetImpl::assumePrimary() {
+ LOG(2) << "replSet assuming primary" << endl;
+ assert( iAmPotentiallyHot() );
+ writelock lk("admin."); // so we are synchronized with _logOp()
+
+ // Make sure that new OpTimes are higher than existing ones even with clock skew
+ DBDirectClient c;
+ BSONObj lastOp = c.findOne( "local.oplog.rs", Query().sort(reverseNaturalObj), NULL, QueryOption_SlaveOk );
+ if ( !lastOp.isEmpty() ) {
+ OpTime::setLast( lastOp[ "ts" ].date() );
+ }
+
+ changeState(MemberState::RS_PRIMARY);
+ }
+
+ void ReplSetImpl::changeState(MemberState s) { box.change(s, _self); }
+
+ void ReplSetImpl::setMaintenanceMode(const bool inc) {
+ lock lk(this);
+
+ if (inc) {
+ log() << "replSet going into maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog;
+
+ _maintenanceMode++;
+ changeState(MemberState::RS_RECOVERING);
+ }
+ else {
+ _maintenanceMode--;
+ // no need to change state, syncTail will try to go live as a secondary soon
+
+ log() << "leaving maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog;
+ }
+ }
+
+ Member* ReplSetImpl::getMostElectable() {
+ lock lk(this);
+
+ Member *max = 0;
+
+        for (set<unsigned>::iterator it = _electableSet.begin(); it != _electableSet.end(); ) {
+            const Member *temp = findById(*it);
+            if (!temp) {
+                log() << "couldn't find member: " << *it << endl;
+                // erase via a post-incremented iterator copy; erasing by value
+                // here would invalidate 'it' and make the next increment undefined
+                _electableSet.erase(it++);
+                continue;
+            }
+            if (!max || max->config().priority < temp->config().priority) {
+                max = (Member*)temp;
+            }
+            ++it;
+        }
+
+ return max;
+ }
+
+ const bool closeOnRelinquish = true;
+
+ void ReplSetImpl::relinquish() {
+ LOG(2) << "replSet attempting to relinquish" << endl;
+ if( box.getState().primary() ) {
+ {
+ writelock lk("admin."); // so we are synchronized with _logOp()
+
+ log() << "replSet relinquishing primary state" << rsLog;
+ changeState(MemberState::RS_SECONDARY);
+ }
+
+ if( closeOnRelinquish ) {
+            /* close sockets that were talking to us so they don't blithely send many writes that will fail
+               with "not master" (of course the client could check the result code, but many do not)
+ */
+ log() << "replSet closing client sockets after reqlinquishing primary" << rsLog;
+ MessagingPort::closeAllSockets(1);
+ }
+
+            // now that all connections were closed, strip this mongod from all sharding details;
+            // if and when it gets promoted to primary again, only then should it reload the sharding state.
+            // the rationale is that this mongod won't bring stale state along when it regains primaryhood
+ shardingState.resetShardingState();
+
+ }
+ else if( box.getState().startup2() ) {
+ // ? add comment
+ changeState(MemberState::RS_RECOVERING);
+ }
+ }
+
+ /* look freshly for who is primary - includes relinquishing ourself. */
+ void ReplSetImpl::forgetPrimary() {
+ if( box.getState().primary() )
+ relinquish();
+ else {
+ box.setOtherPrimary(0);
+ }
+ }
+
+ // for the replSetStepDown command
+ bool ReplSetImpl::_stepDown(int secs) {
+ lock lk(this);
+ if( box.getState().primary() ) {
+ elect.steppedDown = time(0) + secs;
+ log() << "replSet info stepping down as primary secs=" << secs << rsLog;
+ relinquish();
+ return true;
+ }
+ return false;
+ }
+
+ bool ReplSetImpl::_freeze(int secs) {
+ lock lk(this);
+ /* note if we are primary we remain primary but won't try to elect ourself again until
+ this time period expires.
+ */
+ if( secs == 0 ) {
+ elect.steppedDown = 0;
+ log() << "replSet info 'unfreezing'" << rsLog;
+ }
+ else {
+ if( !box.getState().primary() ) {
+ elect.steppedDown = time(0) + secs;
+ log() << "replSet info 'freezing' for " << secs << " seconds" << rsLog;
+ }
+ else {
+ log() << "replSet info received freeze command but we are primary" << rsLog;
+ }
+ }
+ return true;
+ }
+
+ void ReplSetImpl::msgUpdateHBInfo(HeartbeatInfo h) {
+ for( Member *m = _members.head(); m; m=m->next() ) {
+ if( m->id() == h.id() ) {
+ m->_hbinfo = h;
+ return;
+ }
+ }
+ }
+
+ list<HostAndPort> ReplSetImpl::memberHostnames() const {
+ list<HostAndPort> L;
+ L.push_back(_self->h());
+ for( Member *m = _members.head(); m; m = m->next() )
+ L.push_back(m->h());
+ return L;
+ }
+
+ void ReplSetImpl::_fillIsMasterHost(const Member *m, vector<string>& hosts, vector<string>& passives, vector<string>& arbiters) {
+ assert( m );
+ if( m->config().hidden )
+ return;
+
+ if( m->potentiallyHot() ) {
+ hosts.push_back(m->h().toString());
+ }
+ else if( !m->config().arbiterOnly ) {
+ if( m->config().slaveDelay ) {
+ /* hmmm - we don't list these as they are stale. */
+ }
+ else {
+ passives.push_back(m->h().toString());
+ }
+ }
+ else {
+ arbiters.push_back(m->h().toString());
+ }
+ }
+
+ void ReplSetImpl::_fillIsMaster(BSONObjBuilder& b) {
+ lock lk(this);
+
+ const StateBox::SP sp = box.get();
+ bool isp = sp.state.primary();
+ b.append("setName", name());
+ b.append("ismaster", isp);
+ b.append("secondary", sp.state.secondary());
+ {
+ vector<string> hosts, passives, arbiters;
+ _fillIsMasterHost(_self, hosts, passives, arbiters);
+
+ for( Member *m = _members.head(); m; m = m->next() ) {
+ assert( m );
+ _fillIsMasterHost(m, hosts, passives, arbiters);
+ }
+
+ if( hosts.size() > 0 ) {
+ b.append("hosts", hosts);
+ }
+ if( passives.size() > 0 ) {
+ b.append("passives", passives);
+ }
+ if( arbiters.size() > 0 ) {
+ b.append("arbiters", arbiters);
+ }
+ }
+
+ if( !isp ) {
+ const Member *m = sp.primary;
+ if( m )
+ b.append("primary", m->h().toString());
+ }
+ else {
+ b.append("primary", _self->fullName());
+ }
+
+ if( myConfig().arbiterOnly )
+ b.append("arbiterOnly", true);
+ if( myConfig().priority == 0 && !myConfig().arbiterOnly)
+ b.append("passive", true);
+ if( myConfig().slaveDelay )
+ b.append("slaveDelay", myConfig().slaveDelay);
+ if( myConfig().hidden )
+ b.append("hidden", true);
+ if( !myConfig().buildIndexes )
+ b.append("buildIndexes", false);
+ if( !myConfig().tags.empty() ) {
+ BSONObjBuilder a;
+ for( map<string,string>::const_iterator i = myConfig().tags.begin(); i != myConfig().tags.end(); i++ )
+ a.append((*i).first, (*i).second);
+ b.append("tags", a.done());
+ }
+ b.append("me", myConfig().h.toString());
+ }
+
+ /** @param cfgString <setname>/<seedhost1>,<seedhost2> */
+
+ void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet ) {
+ const char *p = cfgString.c_str();
+ const char *slash = strchr(p, '/');
+ if( slash )
+ setname = string(p, slash-p);
+ else
+ setname = p;
+ uassert(13093, "bad --replSet config string format is: <setname>[/<seedhost1>,<seedhost2>,...]", !setname.empty());
+
+ if( slash == 0 )
+ return;
+
+ p = slash + 1;
+ while( 1 ) {
+ const char *comma = strchr(p, ',');
+ if( comma == 0 ) comma = strchr(p,0);
+ if( p == comma )
+ break;
+ {
+ HostAndPort m;
+ try {
+ m = HostAndPort( string(p, comma-p) );
+ }
+ catch(...) {
+ uassert(13114, "bad --replSet seed hostname", false);
+ }
+ uassert(13096, "bad --replSet command line config string - dups?", seedSet.count(m) == 0 );
+ seedSet.insert(m);
+ //uassert(13101, "can't use localhost in replset host list", !m.isLocalHost());
+ if( m.isSelf() ) {
+ log(1) << "replSet ignoring seed " << m.toString() << " (=self)" << rsLog;
+ }
+ else
+ seeds.push_back(m);
+ if( *comma == 0 )
+ break;
+ p = comma + 1;
+ }
+ }
+ }
+
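+    /* example: parseReplsetCmdLine("rs0/alpha:27017,beta:27017", ...) yields
+       setname == "rs0" and seeds == [ alpha:27017, beta:27017 ]; a seed that
+       resolves to ourselves is logged and skipped. */
+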
+ ReplSetImpl::ReplSetImpl(ReplSetCmdline& replSetCmdline) : elect(this),
+ _currentSyncTarget(0),
+ _blockSync(false),
+ _hbmsgTime(0),
+ _self(0),
+ _maintenanceMode(0),
+ mgr( new Manager(this) ),
+ ghost( new GhostSync(this) ) {
+
+ _cfg = 0;
+ memset(_hbmsg, 0, sizeof(_hbmsg));
+ strcpy( _hbmsg , "initial startup" );
+ lastH = 0;
+ changeState(MemberState::RS_STARTUP);
+
+ _seeds = &replSetCmdline.seeds;
+
+ LOG(1) << "replSet beginning startup..." << rsLog;
+
+ loadConfig();
+
+ unsigned sss = replSetCmdline.seedSet.size();
+ for( Member *m = head(); m; m = m->next() ) {
+ replSetCmdline.seedSet.erase(m->h());
+ }
+ for( set<HostAndPort>::iterator i = replSetCmdline.seedSet.begin(); i != replSetCmdline.seedSet.end(); i++ ) {
+ if( i->isSelf() ) {
+ if( sss == 1 ) {
+ LOG(1) << "replSet warning self is listed in the seed list and there are no other seeds listed did you intend that?" << rsLog;
+ }
+ }
+ else {
+ log() << "replSet warning command line seed " << i->toString() << " is not present in the current repl set config" << rsLog;
+ }
+ }
+ }
+
+ void newReplUp();
+
+ void ReplSetImpl::loadLastOpTimeWritten(bool quiet) {
+ readlock lk(rsoplog);
+ BSONObj o;
+ if( Helpers::getLast(rsoplog, o) ) {
+ lastH = o["h"].numberLong();
+ lastOpTimeWritten = o["ts"]._opTime();
+ uassert(13290, "bad replSet oplog entry?", quiet || !lastOpTimeWritten.isNull());
+ }
+ }
+
+ /* call after constructing to start - returns fairly quickly after launching its threads */
+ void ReplSetImpl::_go() {
+ try {
+ loadLastOpTimeWritten();
+ }
+ catch(std::exception& e) {
+ log() << "replSet error fatal couldn't query the local " << rsoplog << " collection. Terminating mongod after 30 seconds." << rsLog;
+ log() << e.what() << rsLog;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ return;
+ }
+
+ changeState(MemberState::RS_STARTUP2);
+ startThreads();
+ newReplUp(); // oplog.cpp
+ }
+
+ ReplSetImpl::StartupStatus ReplSetImpl::startupStatus = PRESTART;
+ DiagStr ReplSetImpl::startupStatusMsg;
+
+ extern BSONObj *getLastErrorDefault;
+
+ void ReplSetImpl::setSelfTo(Member *m) {
+ // already locked in initFromConfig
+ _self = m;
+ _id = m->id();
+ _config = m->config();
+        _buildIndexes = m->config().buildIndexes;
+ }
+
+ /** @param reconf true if this is a reconfiguration and not an initial load of the configuration.
+ @return true if ok; throws if config really bad; false if config doesn't include self
+ */
+ bool ReplSetImpl::initFromConfig(ReplSetConfig& c, bool reconf) {
+ /* NOTE: haveNewConfig() writes the new config to disk before we get here. So
+ we cannot error out at this point, except fatally. Check errors earlier.
+ */
+ lock lk(this);
+
+ if( getLastErrorDefault || !c.getLastErrorDefaults.isEmpty() ) {
+ // see comment in dbcommands.cpp for getlasterrordefault
+ getLastErrorDefault = new BSONObj( c.getLastErrorDefaults );
+ }
+
+ list<ReplSetConfig::MemberCfg*> newOnes;
+ // additive short-cuts the new config setup. If we are just adding a
+ // node/nodes and nothing else is changing, this is additive. If it's
+ // not a reconfig, we're not adding anything
+ bool additive = reconf;
+ {
+ unsigned nfound = 0;
+ int me = 0;
+ for( vector<ReplSetConfig::MemberCfg>::iterator i = c.members.begin(); i != c.members.end(); i++ ) {
+
+ ReplSetConfig::MemberCfg& m = *i;
+ if( m.h.isSelf() ) {
+ me++;
+ }
+
+ if( reconf ) {
+ if (m.h.isSelf() && (!_self || (int)_self->id() != m._id)) {
+ log() << "self doesn't match: " << m._id << rsLog;
+ assert(false);
+ }
+
+ const Member *old = findById(m._id);
+ if( old ) {
+ nfound++;
+ assert( (int) old->id() == m._id );
+ if( old->config() != m ) {
+ additive = false;
+ }
+ }
+ else {
+ newOnes.push_back(&m);
+ }
+ }
+ }
+ if( me == 0 ) {
+ _members.orphanAll();
+
+ // sending hbs must continue to pick up new config, so we leave
+ // hb threads alone
+
+ // close sockets to force clients to re-evaluate this member
+ MessagingPort::closeAllSockets(0);
+
+ // stop sync thread
+ box.set(MemberState::RS_STARTUP, 0);
+
+ // go into holding pattern
+ log() << "replSet error self not present in the repl set configuration:" << rsLog;
+ log() << c.toString() << rsLog;
+ return false;
+ }
+ uassert( 13302, "replSet error self appears twice in the repl set configuration", me<=1 );
+
+        // if we found different members than the original config, reload everything
+ if( reconf && config().members.size() != nfound )
+ additive = false;
+ }
+
+ _cfg = new ReplSetConfig(c);
+ assert( _cfg->ok() );
+ assert( _name.empty() || _name == _cfg->_id );
+ _name = _cfg->_id;
+ assert( !_name.empty() );
+
+ // this is a shortcut for simple changes
+ if( additive ) {
+ log() << "replSet info : additive change to configuration" << rsLog;
+ for( list<ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin(); i != newOnes.end(); i++ ) {
+ ReplSetConfig::MemberCfg *m = *i;
+ Member *mi = new Member(m->h, m->_id, m, false);
+
+                /** we will indicate that new members are up() initially so that we don't relinquish our
+                    primary state because we (transiently) can't see a majority. they should in fact be up,
+                    as we check that new members are reachable before getting here on a reconfig anyway.
+ */
+ mi->get_hbinfo().health = 0.1;
+
+ _members.push(mi);
+ startHealthTaskFor(mi);
+ }
+
+ // if we aren't creating new members, we may have to update the
+ // groups for the current ones
+ _cfg->updateMembers(_members);
+
+ return true;
+ }
+
+ // start with no members. if this is a reconfig, drop the old ones.
+ _members.orphanAll();
+
+ endOldHealthTasks();
+
+ int oldPrimaryId = -1;
+ {
+ const Member *p = box.getPrimary();
+ if( p )
+ oldPrimaryId = p->id();
+ }
+ forgetPrimary();
+
+ // not setting _self to 0 as other threads use _self w/o locking
+ int me = 0;
+
+ // For logging
+ string members = "";
+
+ for( vector<ReplSetConfig::MemberCfg>::iterator i = _cfg->members.begin(); i != _cfg->members.end(); i++ ) {
+ ReplSetConfig::MemberCfg& m = *i;
+ Member *mi;
+ members += ( members == "" ? "" : ", " ) + m.h.toString();
+ if( m.h.isSelf() ) {
+ assert( me++ == 0 );
+ mi = new Member(m.h, m._id, &m, true);
+ if (!reconf) {
+ log() << "replSet I am " << m.h.toString() << rsLog;
+ }
+ setSelfTo(mi);
+
+ if( (int)mi->id() == oldPrimaryId )
+ box.setSelfPrimary(mi);
+ }
+ else {
+ mi = new Member(m.h, m._id, &m, false);
+ _members.push(mi);
+ startHealthTaskFor(mi);
+ if( (int)mi->id() == oldPrimaryId )
+ box.setOtherPrimary(mi);
+ }
+ }
+
+ if( me == 0 ){
+ log() << "replSet warning did not detect own host in full reconfig, members " << members << " config: " << c << rsLog;
+ }
+
+ return true;
+ }
+
+ // Our own config must be the first one.
+ bool ReplSetImpl::_loadConfigFinish(vector<ReplSetConfig>& cfgs) {
+ int v = -1;
+ ReplSetConfig *highest = 0;
+ int myVersion = -2000;
+ int n = 0;
+ for( vector<ReplSetConfig>::iterator i = cfgs.begin(); i != cfgs.end(); i++ ) {
+ ReplSetConfig& cfg = *i;
+ if( ++n == 1 ) myVersion = cfg.version;
+ if( cfg.ok() && cfg.version > v ) {
+ highest = &cfg;
+ v = cfg.version;
+ }
+ }
+ assert( highest );
+
+ if( !initFromConfig(*highest) )
+ return false;
+
+ if( highest->version > myVersion && highest->version >= 0 ) {
+ log() << "replSet got config version " << highest->version << " from a remote, saving locally" << rsLog;
+ highest->saveConfigLocally(BSONObj());
+ }
+ return true;
+ }
+
+ void ReplSetImpl::loadConfig() {
+ while( 1 ) {
+ startupStatus = LOADINGCONFIG;
+ startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)");
+ LOG(1) << "loadConfig() " << rsConfigNs << endl;
+ try {
+ vector<ReplSetConfig> configs;
+ try {
+ configs.push_back( ReplSetConfig(HostAndPort::me()) );
+ }
+ catch(DBException& e) {
+ log() << "replSet exception loading our local replset configuration object : " << e.toString() << rsLog;
+ }
+ for( vector<HostAndPort>::const_iterator i = _seeds->begin(); i != _seeds->end(); i++ ) {
+ try {
+ configs.push_back( ReplSetConfig(*i) );
+ }
+ catch( DBException& e ) {
+ log() << "replSet exception trying to load config from " << *i << " : " << e.toString() << rsLog;
+ }
+ }
+ {
+ scoped_lock lck( replSettings.discoveredSeeds_mx );
+ if( replSettings.discoveredSeeds.size() > 0 ) {
+ for (set<string>::iterator i = replSettings.discoveredSeeds.begin();
+ i != replSettings.discoveredSeeds.end();
+ i++) {
+ try {
+ configs.push_back( ReplSetConfig(HostAndPort(*i)) );
+ }
+ catch( DBException& ) {
+ log(1) << "replSet exception trying to load config from discovered seed " << *i << rsLog;
+ replSettings.discoveredSeeds.erase(*i);
+ }
+ }
+ }
+ }
+
+ if (!replSettings.reconfig.isEmpty()) {
+ try {
+ configs.push_back(ReplSetConfig(replSettings.reconfig, true));
+ }
+ catch( DBException& re) {
+ log() << "replSet couldn't load reconfig: " << re.what() << rsLog;
+ replSettings.reconfig = BSONObj();
+ }
+ }
+
+ int nok = 0;
+ int nempty = 0;
+ for( vector<ReplSetConfig>::iterator i = configs.begin(); i != configs.end(); i++ ) {
+ if( i->ok() )
+ nok++;
+ if( i->empty() )
+ nempty++;
+ }
+ if( nok == 0 ) {
+
+ if( nempty == (int) configs.size() ) {
+ startupStatus = EMPTYCONFIG;
+ startupStatusMsg.set("can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)");
+ log() << "replSet can't get " << rsConfigNs << " config from self or any seed (EMPTYCONFIG)" << rsLog;
+ static unsigned once;
+ if( ++once == 1 ) {
+ log() << "replSet info you may need to run replSetInitiate -- rs.initiate() in the shell -- if that is not already done" << rsLog;
+ }
+ if( _seeds->size() == 0 ) {
+ LOG(1) << "replSet info no seed hosts were specified on the --replSet command line" << rsLog;
+ }
+ }
+ else {
+ startupStatus = EMPTYUNREACHABLE;
+ startupStatusMsg.set("can't currently get " + rsConfigNs + " config from self or any seed (EMPTYUNREACHABLE)");
+ log() << "replSet can't get " << rsConfigNs << " config from self or any seed (yet)" << rsLog;
+ }
+
+ sleepsecs(10);
+ continue;
+ }
+
+ if( !_loadConfigFinish(configs) ) {
+ log() << "replSet info Couldn't load config yet. Sleeping 20sec and will try again." << rsLog;
+ sleepsecs(20);
+ continue;
+ }
+ }
+ catch(DBException& e) {
+ startupStatus = BADCONFIG;
+ startupStatusMsg.set("replSet error loading set config (BADCONFIG)");
+ log() << "replSet error loading configurations " << e.toString() << rsLog;
+ log() << "replSet error replication will not start" << rsLog;
+ sethbmsg("error loading set config");
+ _fatal();
+ throw;
+ }
+ break;
+ }
+ startupStatusMsg.set("? started");
+ startupStatus = STARTED;
+ }
+
+ void ReplSetImpl::_fatal() {
+ box.set(MemberState::RS_FATAL, 0);
+ log() << "replSet error fatal, stopping replication" << rsLog;
+ }
+
+ void ReplSet::haveNewConfig(ReplSetConfig& newConfig, bool addComment) {
+ bo comment;
+ if( addComment )
+ comment = BSON( "msg" << "Reconfig set" << "version" << newConfig.version );
+
+ newConfig.saveConfigLocally(comment);
+
+ try {
+ if (initFromConfig(newConfig, true)) {
+ log() << "replSet replSetReconfig new config saved locally" << rsLog;
+ }
+ }
+ catch(DBException& e) {
+ if( e.getCode() == 13497 /* removed from set */ ) {
+ cc().shutdown();
+ dbexit( EXIT_CLEAN , "removed from replica set" ); // never returns
+ assert(0);
+ }
+ log() << "replSet error unexpected exception in haveNewConfig() : " << e.toString() << rsLog;
+ _fatal();
+ }
+ catch(...) {
+ log() << "replSet error unexpected exception in haveNewConfig()" << rsLog;
+ _fatal();
+ }
+ }
+
+ void Manager::msgReceivedNewConfig(BSONObj o) {
+ log() << "replset msgReceivedNewConfig version: " << o["version"].toString() << rsLog;
+ ReplSetConfig c(o);
+ if( c.version > rs->config().version )
+ theReplSet->haveNewConfig(c, false);
+ else {
+ log() << "replSet info msgReceivedNewConfig but version isn't higher " <<
+ c.version << ' ' << rs->config().version << rsLog;
+ }
+ }
+
+    /* forked as a thread during startup.
+ it can run quite a while looking for config. but once found,
+ a separate thread takes over as ReplSetImpl::Manager, and this thread
+ terminates.
+ */
+ void startReplSets(ReplSetCmdline *replSetCmdline) {
+ Client::initThread("rsStart");
+ try {
+ assert( theReplSet == 0 );
+ if( replSetCmdline == 0 ) {
+ assert(!replSet);
+ return;
+ }
+ replLocalAuth();
+ (theReplSet = new ReplSet(*replSetCmdline))->go();
+ }
+ catch(std::exception& e) {
+ log() << "replSet caught exception in startReplSets thread: " << e.what() << rsLog;
+ if( theReplSet )
+ theReplSet->fatal();
+ }
+ cc().shutdown();
+ }
+
+ void replLocalAuth() {
+ if ( noauth )
+ return;
+ cc().getAuthenticationInfo()->authorize("local","_repl");
+ }
+
+
+}
+
+namespace boost {
+
+ void assertion_failed(char const * expr, char const * function, char const * file, long line) {
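+        // note: we only log boost assertion failures here; we do not abort the process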
+ mongo::log() << "boost assertion failure " << expr << ' ' << function << ' ' << file << ' ' << line << endl;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs.h b/src/mongo/db/repl/rs.h
new file mode 100644
index 00000000000..8e43204be3b
--- /dev/null
+++ b/src/mongo/db/repl/rs.h
@@ -0,0 +1,667 @@
+// src/mongo/db/repl/rs.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../util/concurrency/list.h"
+#include "../../util/concurrency/value.h"
+#include "../../util/concurrency/msg.h"
+#include "../../util/net/hostandport.h"
+#include "../commands.h"
+#include "../oplog.h"
+#include "../oplogreader.h"
+#include "rs_exception.h"
+#include "rs_optime.h"
+#include "rs_member.h"
+#include "rs_config.h"
+
+/**
+ * Order of Events
+ *
+ * On startup, if the --replSet option is present, startReplSets is called.
+ * startReplSets forks off a new thread for replica set activities. It creates
+ * the global theReplSet variable and calls go() on it.
+ *
+ * theReplSet's constructor changes the replica set's state to RS_STARTUP,
+ * starts the replica set manager, and loads the config (if the replica set
+ * has been initialized).
+ */
+
+namespace mongo {
+
+ struct HowToFixUp;
+ struct Target;
+ class DBClientConnection;
+ class ReplSetImpl;
+ class OplogReader;
+ extern bool replSet; // true if using repl sets
+ extern class ReplSet *theReplSet; // null until initialized
+ extern Tee *rsLog;
+
+ /* member of a replica set */
+ class Member : public List1<Member>::Base {
+ private:
+        ~Member(); // intentionally unimplemented, as it should never be called -- see List1<>::Base.
+ Member(const Member&);
+ public:
+ Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self);
+
+ string fullName() const { return h().toString(); }
+ const ReplSetConfig::MemberCfg& config() const { return _config; }
+ ReplSetConfig::MemberCfg& configw() { return _config; }
+ const HeartbeatInfo& hbinfo() const { return _hbinfo; }
+ HeartbeatInfo& get_hbinfo() { return _hbinfo; }
+ string lhb() const { return _hbinfo.lastHeartbeatMsg; }
+ MemberState state() const { return _hbinfo.hbstate; }
+ const HostAndPort& h() const { return _h; }
+ unsigned id() const { return _hbinfo.id(); }
+
+ bool potentiallyHot() const { return _config.potentiallyHot(); } // not arbiter, not priority 0
+ void summarizeMember(stringstream& s) const;
+
+ private:
+ friend class ReplSetImpl;
+ ReplSetConfig::MemberCfg _config;
+ const HostAndPort _h;
+ HeartbeatInfo _hbinfo;
+ };
+
+ namespace replset {
+ /**
+ * "Normal" replica set syncing
+ */
+ class SyncTail : public Sync {
+ public:
+ virtual ~SyncTail() {}
+ SyncTail(const string& host) : Sync(host) {}
+ virtual bool syncApply(const BSONObj &o);
+ };
+
+ /**
+ * Initial clone and sync
+ */
+ class InitialSync : public SyncTail {
+ public:
+ InitialSync(const string& host) : SyncTail(host) {}
+ virtual ~InitialSync() {}
+ bool oplogApplication(OplogReader& r, const Member* source, const OpTime& applyGTE, const OpTime& minValid);
+ virtual void applyOp(const BSONObj& o, const OpTime& minvalid);
+ };
+
+ // TODO: move hbmsg into an error-keeping class (SERVER-4444)
+ void sethbmsg(const string& s, const int logLevel=0);
+
+ } // namespace replset
+
+ class Manager : public task::Server {
+ ReplSetImpl *rs;
+ bool busyWithElectSelf;
+ int _primary;
+
+        /** @param two - set to true if two primaries were seen; this can happen transiently,
+            especially since our polling is only occasional. in that case null is returned,
+            but the caller should not assume it is primary itself in that situation.
+ */
+ const Member* findOtherPrimary(bool& two);
+
+ void noteARemoteIsPrimary(const Member *);
+ void checkElectableSet();
+ void checkAuth();
+ virtual void starting();
+ public:
+ Manager(ReplSetImpl *rs);
+ virtual ~Manager();
+ void msgReceivedNewConfig(BSONObj);
+ void msgCheckNewState();
+ };
+
+ class GhostSync : public task::Server {
+ struct GhostSlave : boost::noncopyable {
+ GhostSlave() : last(0), slave(0), init(false) { }
+ OplogReader reader;
+ OpTime last;
+ Member* slave;
+ bool init;
+ };
+ /**
+ * This is a cache of ghost slaves
+ */
+ typedef map< mongo::OID,shared_ptr<GhostSlave> > MAP;
+ MAP _ghostCache;
+ RWLock _lock; // protects _ghostCache
+ ReplSetImpl *rs;
+ virtual void starting();
+ public:
+ GhostSync(ReplSetImpl *_rs) : task::Server("rsGhostSync"), _lock("GhostSync"), rs(_rs) {}
+ ~GhostSync() {
+ log() << "~GhostSync() called" << rsLog;
+ }
+
+ /**
+ * Replica sets can sync in a hierarchical fashion, which throws off w
+ * calculation on the master. percolate() faux-syncs from an upstream
+ * node so that the primary will know what the slaves are up to.
+ *
+ * We can't just directly sync to the primary because it could be
+ * unreachable, e.g., S1--->S2--->S3--->P. S2 should ghost sync from S3
+ * and S3 can ghost sync from the primary.
+ *
+ * Say we have an S1--->S2--->P situation and this node is S2. rid
+ * would refer to S1. S2 would create a ghost slave of S1 and connect
+ * it to P (_currentSyncTarget). Then it would use this connection to
+ * pretend to be S1, replicating off of P.
+ */
+ void percolate(const BSONObj& rid, const OpTime& last);
+ void associateSlave(const BSONObj& rid, const int memberId);
+ void updateSlave(const mongo::OID& id, const OpTime& last);
+ };
+
+ struct Target;
+
+ class Consensus {
+ ReplSetImpl &rs;
+ struct LastYea {
+ LastYea() : when(0), who(0xffffffff) { }
+ time_t when;
+ unsigned who;
+ };
+ static SimpleMutex lyMutex;
+ Guarded<LastYea,lyMutex> ly;
+ unsigned yea(unsigned memberId); // throws VoteException
+ void electionFailed(unsigned meid);
+ void _electSelf();
+ bool weAreFreshest(bool& allUp, int& nTies);
+ bool sleptLast; // slept last elect() pass
+ public:
+ Consensus(ReplSetImpl *t) : rs(*t) {
+ sleptLast = false;
+ steppedDown = 0;
+ }
+
+        /* if we've stepped down, this is when we are allowed to try to elect ourselves again.
+           todo: handle possible weirdness with clock skew etc.
+ */
+ time_t steppedDown;
+
+ int totalVotes() const;
+ bool aMajoritySeemsToBeUp() const;
+ bool shouldRelinquish() const;
+ void electSelf();
+ void electCmdReceived(BSONObj, BSONObjBuilder*);
+ void multiCommand(BSONObj cmd, list<Target>& L);
+ };
+
+ /**
+ * most operations on a ReplSet object should be done while locked. that
+     * logic is implemented here.
+ *
+ * Order of locking: lock the replica set, then take a rwlock.
+ */
+ class RSBase : boost::noncopyable {
+ public:
+ const unsigned magic;
+ void assertValid() { assert( magic == 0x12345677 ); }
+ private:
+ mongo::mutex m;
+ int _locked;
+ ThreadLocalValue<bool> _lockedByMe;
+ protected:
+ RSBase() : magic(0x12345677), m("RSBase"), _locked(0) { }
+ ~RSBase() {
+ /* this can happen if we throw in the constructor; otherwise never happens. thus we log it as it is quite unusual. */
+ log() << "replSet ~RSBase called" << rsLog;
+ }
+
+ public:
+ class lock {
+ RSBase& rsbase;
+ auto_ptr<scoped_lock> sl;
+ public:
+ lock(RSBase* b) : rsbase(*b) {
+ if( rsbase._lockedByMe.get() )
+ return; // recursive is ok...
+
+ sl.reset( new scoped_lock(rsbase.m) );
+ DEV assert(rsbase._locked == 0);
+ rsbase._locked++;
+ rsbase._lockedByMe.set(true);
+ }
+ ~lock() {
+ if( sl.get() ) {
+ assert( rsbase._lockedByMe.get() );
+ DEV assert(rsbase._locked == 1);
+ rsbase._lockedByMe.set(false);
+ rsbase._locked--;
+ }
+ }
+ };
+
+ /* for asserts */
+ bool locked() const { return _locked != 0; }
+
+        /* if true, is locked, and was locked by this thread. note if false, it could still be locked by another thread.
+           this is just for asserts & such so we can make the contracts clear on who locks what when.
+ we don't use these locks that frequently, so the little bit of overhead is fine.
+ */
+ bool lockedByMe() { return _lockedByMe.get(); }
+ };
+
+ class ReplSetHealthPollTask;
+
+    /* safe container for our state that keeps the member pointer and state variables consistent with each other */
+ class StateBox : boost::noncopyable {
+ public:
+ struct SP { // SP is like pair<MemberState,const Member *> but nicer
+ SP() : state(MemberState::RS_STARTUP), primary(0) { }
+ MemberState state;
+ const Member *primary;
+ };
+ const SP get() {
+ rwlock lk(m, false);
+ return sp;
+ }
+ MemberState getState() const {
+ rwlock lk(m, false);
+ return sp.state;
+ }
+ const Member* getPrimary() const {
+ rwlock lk(m, false);
+ return sp.primary;
+ }
+ void change(MemberState s, const Member *self) {
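+            // keep the cached primary pointer consistent with the new state:
+            // becoming primary points it at ourself; leaving the primary state clears it if it pointed at us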
+ rwlock lk(m, true);
+ if( sp.state != s ) {
+ log() << "replSet " << s.toString() << rsLog;
+ }
+ sp.state = s;
+ if( s.primary() ) {
+ sp.primary = self;
+ }
+ else {
+ if( self == sp.primary )
+ sp.primary = 0;
+ }
+ }
+ void set(MemberState s, const Member *p) {
+ rwlock lk(m, true);
+ sp.state = s;
+ sp.primary = p;
+ }
+ void setSelfPrimary(const Member *self) { change(MemberState::RS_PRIMARY, self); }
+ void setOtherPrimary(const Member *mem) {
+ rwlock lk(m, true);
+ assert( !sp.state.primary() );
+ sp.primary = mem;
+ }
+ void noteRemoteIsPrimary(const Member *remote) {
+ rwlock lk(m, true);
+ if( !sp.state.secondary() && !sp.state.fatal() )
+ sp.state = MemberState::RS_RECOVERING;
+ sp.primary = remote;
+ }
+ StateBox() : m("StateBox") { }
+ private:
+ RWLock m;
+ SP sp;
+ };
+
+ void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet );
+
+ /** Parameter given to the --replSet command line option (parsed).
+ Syntax is "<setname>/<seedhost1>,<seedhost2>"
+ where setname is a name and seedhost is "<host>[:<port>]" */
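+    /* example (hypothetical hosts): --replSet rs0/alice.example.com:27017,bob.example.com:27017 */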
+ class ReplSetCmdline {
+ public:
+ ReplSetCmdline(string cfgString) { parseReplsetCmdLine(cfgString, setname, seeds, seedSet); }
+ string setname;
+ vector<HostAndPort> seeds;
+ set<HostAndPort> seedSet;
+ };
+
+ /* information about the entire repl set, such as the various servers in the set, and their state */
+ /* note: We currently do not free mem when the set goes away - it is assumed the replset is a
+ singleton and long lived.
+ */
+ class ReplSetImpl : protected RSBase {
+ public:
+ /** info on our state if the replset isn't yet "up". for example, if we are pre-initiation. */
+ enum StartupStatus {
+ PRESTART=0, LOADINGCONFIG=1, BADCONFIG=2, EMPTYCONFIG=3,
+ EMPTYUNREACHABLE=4, STARTED=5, SOON=6
+ };
+ static StartupStatus startupStatus;
+ static DiagStr startupStatusMsg;
+ static string stateAsHtml(MemberState state);
+
+ /* todo thread */
+ void msgUpdateHBInfo(HeartbeatInfo);
+
+ StateBox box;
+
+ OpTime lastOpTimeWritten;
+ long long lastH; // hash we use to make sure we are reading the right flow of ops and aren't on an out-of-date "fork"
+ private:
+ set<ReplSetHealthPollTask*> healthTasks;
+ void endOldHealthTasks();
+ void startHealthTaskFor(Member *m);
+
+ Consensus elect;
+ void relinquish();
+ void forgetPrimary();
+ protected:
+ bool _stepDown(int secs);
+ bool _freeze(int secs);
+ private:
+ void assumePrimary();
+ void loadLastOpTimeWritten(bool quiet=false);
+ void changeState(MemberState s);
+
+ /**
+ * Find the closest member (using ping time) with a higher latest optime.
+ */
+ Member* getMemberToSyncTo();
+ void veto(const string& host, unsigned secs=10);
+ Member* _currentSyncTarget;
+
+ bool _blockSync;
+ void blockSync(bool block);
+
+ // set of electable members' _ids
+ set<unsigned> _electableSet;
+ protected:
+ // "heartbeat message"
+ // sent in requestHeartbeat respond in field "hbm"
+        char _hbmsg[256]; // we change this unlocked, thus not a std::string
+ time_t _hbmsgTime; // when it was logged
+ public:
+ void sethbmsg(string s, int logLevel = 0);
+
+ /**
+ * Election with Priorities
+ *
+ * Each node (n) keeps a set of nodes that could be elected primary.
+ * Each node in this set:
+ *
+ * 1. can connect to a majority of the set
+ * 2. has a priority greater than 0
+ * 3. has an optime within 10 seconds of the most up-to-date node
+ * that n can reach
+ *
+ * If a node fails to meet one or more of these criteria, it is removed
+ * from the list. This list is updated whenever the node receives a
+ * heartbeat.
+ *
+ * When a node sends an "am I freshest?" query, the node receiving the
+ * query checks their electable list to make sure that no one else is
+ * electable AND higher priority. If this check passes, the node will
+ * return an "ok" response, if not, it will veto.
+ *
+ * If a node is primary and there is another node with higher priority
+ * on the electable list (i.e., it must be synced to within 10 seconds
+ * of the current primary), the node (or nodes) with connections to both
+ * the primary and the secondary with higher priority will issue
+ * replSetStepDown requests to the primary to allow the higher-priority
+ * node to take over.
+ */
+ void addToElectable(const unsigned m) { lock lk(this); _electableSet.insert(m); }
+ void rmFromElectable(const unsigned m) { lock lk(this); _electableSet.erase(m); }
+ bool iAmElectable() { lock lk(this); return _electableSet.find(_self->id()) != _electableSet.end(); }
+ bool isElectable(const unsigned id) { lock lk(this); return _electableSet.find(id) != _electableSet.end(); }
+ Member* getMostElectable();
+ protected:
+ /**
+ * Load a new config as the replica set's main config.
+ *
+ * If there is a "simple" change (just adding a node), this shortcuts
+ * the config. Returns true if the config was changed. Returns false
+         * if the config doesn't include this node.  Throws an exception if
+ * something goes very wrong.
+ *
+ * Behavior to note:
+ * - locks this
+ * - intentionally leaks the old _cfg and any old _members (if the
+ * change isn't strictly additive)
+ */
+ bool initFromConfig(ReplSetConfig& c, bool reconf=false);
+ void _fillIsMaster(BSONObjBuilder&);
+ void _fillIsMasterHost(const Member*, vector<string>&, vector<string>&, vector<string>&);
+ const ReplSetConfig& config() { return *_cfg; }
+ string name() const { return _name; } /* @return replica set's logical name */
+ MemberState state() const { return box.getState(); }
+ void _fatal();
+ void _getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const;
+ void _summarizeAsHtml(stringstream&) const;
+ void _summarizeStatus(BSONObjBuilder&) const; // for replSetGetStatus command
+
+ /* throws exception if a problem initializing. */
+ ReplSetImpl(ReplSetCmdline&);
+
+        /* call after constructing to start - returns fairly quickly after launching its threads */
+ void _go();
+
+ private:
+ string _name;
+ const vector<HostAndPort> *_seeds;
+ ReplSetConfig *_cfg;
+
+ /**
+ * Finds the configuration with the highest version number and attempts
+         * to load it.
+ */
+ bool _loadConfigFinish(vector<ReplSetConfig>& v);
+ /**
+ * Gather all possible configs (from command line seeds, our own config
+ * doc, and any hosts listed therein) and try to initiate from the most
+ * recent config we find.
+ */
+ void loadConfig();
+
+ list<HostAndPort> memberHostnames() const;
+ const ReplSetConfig::MemberCfg& myConfig() const { return _config; }
+ bool iAmArbiterOnly() const { return myConfig().arbiterOnly; }
+ bool iAmPotentiallyHot() const {
+ return myConfig().potentiallyHot() && // not an arbiter
+ elect.steppedDown <= time(0) && // not stepped down/frozen
+ state() == MemberState::RS_SECONDARY; // not stale
+ }
+ protected:
+ Member *_self;
+ bool _buildIndexes; // = _self->config().buildIndexes
+ void setSelfTo(Member *); // use this as it sets buildIndexes var
+ private:
+ List1<Member> _members; // all members of the set EXCEPT _self.
+ ReplSetConfig::MemberCfg _config; // config of _self
+ unsigned _id; // _id of _self
+
+ int _maintenanceMode; // if we should stay in recovering state
+ public:
+ // this is called from within a writelock in logOpRS
+ unsigned selfId() const { return _id; }
+ Manager *mgr;
+ GhostSync *ghost;
+ /**
+ * This forces a secondary to go into recovering state and stay there
+ * until this is called again, passing in "false". Multiple threads can
+ * call this and it will leave maintenance mode once all of the callers
+ * have called it again, passing in false.
+ */
+ void setMaintenanceMode(const bool inc);
+ private:
+ Member* head() const { return _members.head(); }
+ public:
+ const Member* findById(unsigned id) const;
+ private:
+ void _getTargets(list<Target>&, int &configVersion);
+ void getTargets(list<Target>&, int &configVersion);
+ void startThreads();
+ friend class FeedbackThread;
+ friend class CmdReplSetElect;
+ friend class Member;
+ friend class Manager;
+ friend class GhostSync;
+ friend class Consensus;
+
+ private:
+ bool initialSyncOplogApplication(const OpTime& applyGTE, const OpTime& minValid);
+ void _syncDoInitialSync();
+ void syncDoInitialSync();
+ void _syncThread();
+ bool tryToGoLiveAsASecondary(OpTime&); // readlocks
+ void syncTail();
+ unsigned _syncRollback(OplogReader& r);
+ void syncRollback(OplogReader& r);
+ void syncFixUp(HowToFixUp& h, OplogReader& r);
+
+ // get an oplog reader for a server with an oplog entry timestamp greater
+ // than or equal to minTS, if set.
+ Member* _getOplogReader(OplogReader& r, const OpTime& minTS);
+
+ // check lastOpTimeWritten against the remote's earliest op, filling in
+ // remoteOldestOp.
+ bool _isStale(OplogReader& r, const OpTime& minTS, BSONObj& remoteOldestOp);
+
+ // keep a list of hosts that we've tried recently that didn't work
+ map<string,time_t> _veto;
+ public:
+ void syncThread();
+ const OpTime lastOtherOpTime() const;
+ };
+
+ class ReplSet : public ReplSetImpl {
+ public:
+ ReplSet(ReplSetCmdline& replSetCmdline) : ReplSetImpl(replSetCmdline) { }
+
+ // for the replSetStepDown command
+ bool stepDown(int secs) { return _stepDown(secs); }
+
+ // for the replSetFreeze command
+ bool freeze(int secs) { return _freeze(secs); }
+
+ string selfFullName() {
+ assert( _self );
+ return _self->fullName();
+ }
+
+ bool buildIndexes() const { return _buildIndexes; }
+
+        /* call after constructing to start - returns fairly quickly after launching its threads */
+ void go() { _go(); }
+
+ void fatal() { _fatal(); }
+ bool isPrimary() { return box.getState().primary(); }
+ bool isSecondary() { return box.getState().secondary(); }
+ MemberState state() const { return ReplSetImpl::state(); }
+ string name() const { return ReplSetImpl::name(); }
+ const ReplSetConfig& config() { return ReplSetImpl::config(); }
+ void getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const { _getOplogDiagsAsHtml(server_id,ss); }
+ void summarizeAsHtml(stringstream& ss) const { _summarizeAsHtml(ss); }
+ void summarizeStatus(BSONObjBuilder& b) const { _summarizeStatus(b); }
+ void fillIsMaster(BSONObjBuilder& b) { _fillIsMaster(b); }
+
+ /**
+ * We have a new config (reconfig) - apply it.
+ * @param comment write a no-op comment to the oplog about it. only
+ * makes sense if one is primary and initiating the reconf.
+ *
+ * The slaves are updated when they get a heartbeat indicating the new
+ * config. The comment is a no-op.
+ */
+ void haveNewConfig(ReplSetConfig& c, bool comment);
+
+ /**
+         * Pointer assignment isn't necessarily atomic, so this needs to ensure
+ * locking, even though we don't delete old configs.
+ */
+ const ReplSetConfig& getConfig() { return config(); }
+
+ bool lockedByMe() { return RSBase::lockedByMe(); }
+
+ // heartbeat msg to send to others; descriptive diagnostic info
+ string hbmsg() const {
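+            // consider a heartbeat message stale after two minutes and report nothing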
+ if( time(0)-_hbmsgTime > 120 ) return "";
+ return _hbmsg;
+ }
+ };
+
+ /**
+ * Base class for repl set commands. Checks basic things such if we're in
+ * rs mode before the command does its real work.
+ */
+ class ReplSetCommand : public Command {
+ protected:
+ ReplSetCommand(const char * s, bool show=false) : Command(s, show) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool logTheOp() { return false; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const { help << "internal"; }
+
+ /**
+ * Some replica set commands call this and then call check(). This is
+ * intentional, as they might do things before theReplSet is initialized
+ * that still need to be checked for auth.
+ */
+ bool checkAuth(string& errmsg, BSONObjBuilder& result) {
+ if( !noauth ) {
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ if (!ai->isAuthorizedForLock("admin", locktype())) {
+ errmsg = "replSet command unauthorized";
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool check(string& errmsg, BSONObjBuilder& result) {
+ if( !replSet ) {
+ errmsg = "not running with --replSet";
+ if( cmdLine.configsvr ) {
+ result.append("info", "configsvr"); // for shell prompt
+ }
+ return false;
+ }
+
+ if( theReplSet == 0 ) {
+ result.append("startupStatus", ReplSet::startupStatus);
+ string s;
+ errmsg = ReplSet::startupStatusMsg.empty() ? "replset unknown error 2" : ReplSet::startupStatusMsg.get();
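+                // 3 == ReplSetImpl::EMPTYCONFIG, i.e. the set has not been initiated yet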
+ if( ReplSet::startupStatus == 3 )
+ result.append("info", "run rs.initiate(...) if not yet done for the set");
+ return false;
+ }
+
+ return checkAuth(errmsg, result);
+ }
+ };
+
+ /**
+ * does local authentication
+ * directly authorizes against AuthenticationInfo
+ */
+ void replLocalAuth();
+
+ /** inlines ----------------- */
+
+ inline Member::Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self) :
+ _config(*c), _h(h), _hbinfo(ord) {
+ assert(c);
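+        // a member always considers itself healthy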
+ if( self )
+ _hbinfo.health = 1.0;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_config.cpp b/src/mongo/db/repl/rs_config.cpp
new file mode 100644
index 00000000000..22137773aec
--- /dev/null
+++ b/src/mongo/db/repl/rs_config.cpp
@@ -0,0 +1,662 @@
+// rs_config.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "../../client/dbclient.h"
+#include "../../client/syncclusterconnection.h"
+#include "../../util/net/hostandport.h"
+#include "../dbhelpers.h"
+#include "connections.h"
+#include "../oplog.h"
+#include "../instance.h"
+#include "../../util/text.h"
+#include <boost/algorithm/string.hpp>
+
+using namespace bson;
+
+namespace mongo {
+
+ void logOpInitiate(const bo&);
+
+ void assertOnlyHas(BSONObj o, const set<string>& fields) {
+ BSONObj::iterator i(o);
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if( !fields.count( e.fieldName() ) ) {
+ uasserted(13434, str::stream() << "unexpected field '" << e.fieldName() << "' in object");
+ }
+ }
+ }
+
+ list<HostAndPort> ReplSetConfig::otherMemberHostnames() const {
+ list<HostAndPort> L;
+ for( vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); i++ ) {
+ if( !i->h.isSelf() )
+ L.push_back(i->h);
+ }
+ return L;
+ }
+
+ /* comment MUST only be set when initiating the set by the initiator */
+ void ReplSetConfig::saveConfigLocally(bo comment) {
+ checkRsConfig();
+ log() << "replSet info saving a newer config version to local.system.replset" << rsLog;
+ {
+ writelock lk("");
+ Client::Context cx( rsConfigNs );
+ cx.db()->flushFiles(true);
+
+ //theReplSet->lastOpTimeWritten = ??;
+ //rather than above, do a logOp()? probably
+ BSONObj o = asBson();
+ Helpers::putSingletonGod(rsConfigNs.c_str(), o, false/*logOp=false; local db so would work regardless...*/);
+ if( !comment.isEmpty() && (!theReplSet || theReplSet->isPrimary()) )
+ logOpInitiate(comment);
+
+ cx.db()->flushFiles(true);
+ }
+ log() << "replSet saveConfigLocally done" << rsLog;
+ }
+
+ bo ReplSetConfig::MemberCfg::asBson() const {
+ bob b;
+ b << "_id" << _id;
+ b.append("host", h.dynString());
+ if( votes != 1 ) b << "votes" << votes;
+ if( priority != 1.0 ) b << "priority" << priority;
+ if( arbiterOnly ) b << "arbiterOnly" << true;
+ if( slaveDelay ) b << "slaveDelay" << slaveDelay;
+ if( hidden ) b << "hidden" << hidden;
+ if( !buildIndexes ) b << "buildIndexes" << buildIndexes;
+ if( !tags.empty() ) {
+ BSONObjBuilder a;
+ for( map<string,string>::const_iterator i = tags.begin(); i != tags.end(); i++ )
+ a.append((*i).first, (*i).second);
+ b.append("tags", a.done());
+ }
+ return b.obj();
+ }
+
+ void ReplSetConfig::updateMembers(List1<Member> &dest) {
+ for (vector<MemberCfg>::iterator source = members.begin(); source < members.end(); source++) {
+ for( Member *d = dest.head(); d; d = d->next() ) {
+ if (d->fullName() == (*source).h.toString()) {
+ d->configw().groupsw() = (*source).groups();
+ }
+ }
+ }
+ }
+
+ bo ReplSetConfig::asBson() const {
+ bob b;
+ b.append("_id", _id).append("version", version);
+
+ BSONArrayBuilder a;
+ for( unsigned i = 0; i < members.size(); i++ )
+ a.append( members[i].asBson() );
+ b.append("members", a.arr());
+
+ if( !ho.isDefault() || !getLastErrorDefaults.isEmpty() || !rules.empty()) {
+ bob settings;
+ if( !rules.empty() ) {
+ bob modes;
+ for (map<string,TagRule*>::const_iterator it = rules.begin(); it != rules.end(); it++) {
+ bob clauses;
+ vector<TagClause*> r = (*it).second->clauses;
+ for (vector<TagClause*>::iterator it2 = r.begin(); it2 < r.end(); it2++) {
+ clauses << (*it2)->name << (*it2)->target;
+ }
+ modes << (*it).first << clauses.obj();
+ }
+ settings << "getLastErrorModes" << modes.obj();
+ }
+ if( !getLastErrorDefaults.isEmpty() )
+ settings << "getLastErrorDefaults" << getLastErrorDefaults;
+ b << "settings" << settings.obj();
+ }
+
+ return b.obj();
+ }
+
+ static inline void mchk(bool expr) {
+ uassert(13126, "bad Member config", expr);
+ }
+
+ void ReplSetConfig::MemberCfg::check() const {
+ mchk(_id >= 0 && _id <= 255);
+ mchk(priority >= 0 && priority <= 1000);
+ mchk(votes <= 100); // votes >= 0 because it is unsigned
+ uassert(13419, "priorities must be between 0.0 and 100.0", priority >= 0.0 && priority <= 100.0);
+ uassert(13437, "slaveDelay requires priority be zero", slaveDelay == 0 || priority == 0);
+ uassert(13438, "bad slaveDelay value", slaveDelay >= 0 && slaveDelay <= 3600 * 24 * 366);
+ uassert(13439, "priority must be 0 when hidden=true", priority == 0 || !hidden);
+ uassert(13477, "priority must be 0 when buildIndexes=false", buildIndexes || priority == 0);
+ }
+/*
+ string ReplSetConfig::TagSubgroup::toString() const {
+ bool first = true;
+ string result = "\""+name+"\": [";
+ for (set<const MemberCfg*>::const_iterator i = m.begin(); i != m.end(); i++) {
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ result += (*i)->h.toString();
+ }
+ return result+"]";
+ }
+ */
+ string ReplSetConfig::TagClause::toString() const {
+ string result = name+": {";
+ for (map<string,TagSubgroup*>::const_iterator i = subgroups.begin(); i != subgroups.end(); i++) {
+//TEMP? result += (*i).second->toString()+", ";
+ }
+ result += "TagClause toString TEMPORARILY DISABLED";
+ return result + "}";
+ }
+
+ string ReplSetConfig::TagRule::toString() const {
+ string result = "{";
+ for (vector<TagClause*>::const_iterator it = clauses.begin(); it < clauses.end(); it++) {
+ result += ((TagClause*)(*it))->toString()+",";
+ }
+ return result+"}";
+ }
+
+ void ReplSetConfig::TagSubgroup::updateLast(const OpTime& op) {
+ RACECHECK
+ if (last < op) {
+ last = op;
+
+ for (vector<TagClause*>::iterator it = clauses.begin(); it < clauses.end(); it++) {
+ (*it)->updateLast(op);
+ }
+ }
+ }
+
+ void ReplSetConfig::TagClause::updateLast(const OpTime& op) {
+ RACECHECK
+ if (last >= op) {
+ return;
+ }
+
+ // check at least n subgroups greater than clause.last
+ int count = 0;
+ map<string,TagSubgroup*>::iterator it;
+ for (it = subgroups.begin(); it != subgroups.end(); it++) {
+ if ((*it).second->last >= op) {
+ count++;
+ }
+ }
+
+ if (count >= actualTarget) {
+ last = op;
+ rule->updateLast(op);
+ }
+ }
+
+ void ReplSetConfig::TagRule::updateLast(const OpTime& op) {
+ OpTime *earliest = (OpTime*)&op;
+ vector<TagClause*>::iterator it;
+
+ for (it = clauses.begin(); it < clauses.end(); it++) {
+ if ((*it)->last < *earliest) {
+ earliest = &(*it)->last;
+ }
+ }
+
+ // rules are simply and-ed clauses, so whatever the most-behind
+ // clause is at is what the rule is at
+ last = *earliest;
+ }
+
+ /** @param o old config
+ @param n new config
+ */
+ /*static*/
+ bool ReplSetConfig::legalChange(const ReplSetConfig& o, const ReplSetConfig& n, string& errmsg) {
+ assert( theReplSet );
+
+ if( o._id != n._id ) {
+ errmsg = "set name may not change";
+ return false;
+ }
+ /* TODO : wonder if we need to allow o.version < n.version only, which is more lenient.
+                  if someone had some intermediate config this node doesn't have, that could be
+ necessary. but then how did we become primary? so perhaps we are fine as-is.
+ */
+ if( o.version >= n.version ) {
+ errmsg = str::stream() << "version number must increase, old: "
+ << o.version << " new: " << n.version;
+ return false;
+ }
+
+ map<HostAndPort,const ReplSetConfig::MemberCfg*> old;
+ bool isLocalHost = false;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = o.members.begin(); i != o.members.end(); i++ ) {
+ if (i->h.isLocalHost()) {
+ isLocalHost = true;
+ }
+ old[i->h] = &(*i);
+ }
+ int me = 0;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = n.members.begin(); i != n.members.end(); i++ ) {
+ const ReplSetConfig::MemberCfg& m = *i;
+ if ( (isLocalHost && !m.h.isLocalHost()) || (!isLocalHost && m.h.isLocalHost())) {
+ log() << "reconfig error, cannot switch between localhost and hostnames: "
+ << m.h.toString() << rsLog;
+ uasserted(13645, "hosts cannot switch between localhost and hostname");
+ }
+ if( old.count(m.h) ) {
+ const ReplSetConfig::MemberCfg& oldCfg = *old[m.h];
+ if( oldCfg._id != m._id ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << rsLog;
+ uasserted(13432, "_id may not change for members");
+ }
+ if( oldCfg.buildIndexes != m.buildIndexes ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << rsLog;
+ uasserted(13476, "buildIndexes may not change for members");
+ }
+ /* are transitions to and from arbiterOnly guaranteed safe? if not, we should disallow here.
+ there is a test at replsets/replsetarb3.js */
+ if( oldCfg.arbiterOnly != m.arbiterOnly ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << " arbiterOnly cannot change. remove and readd the member instead " << rsLog;
+ uasserted(13510, "arbiterOnly may not change for members");
+ }
+ }
+ if( m.h.isSelf() )
+ me++;
+ }
+
+ uassert(13433, "can't find self in new replset config", me == 1);
+
+ return true;
+ }
+
+ void ReplSetConfig::clear() {
+ version = -5;
+ _ok = false;
+ }
+
+ void ReplSetConfig::setMajority() {
+ int total = members.size();
+ int nonArbiters = total;
+ int strictMajority = total/2+1;
+
+ for (vector<MemberCfg>::iterator it = members.begin(); it < members.end(); it++) {
+ if ((*it).arbiterOnly) {
+ nonArbiters--;
+ }
+ }
+
+ // majority should be all "normal" members if we have something like 4
+ // arbiters & 3 normal members
+ _majority = (strictMajority > nonArbiters) ? nonArbiters : strictMajority;
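+        // e.g. 7 members of which 4 are arbiters: strictMajority=4 > nonArbiters=3, so _majority=3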
+ }
+
+ int ReplSetConfig::getMajority() const {
+ return _majority;
+ }
+
+ void ReplSetConfig::checkRsConfig() const {
+ uassert(13132,
+ str::stream() << "nonmatching repl set name in _id field: " << _id << " vs. " << cmdLine.ourSetName(),
+ _id == cmdLine.ourSetName());
+ uassert(13308, "replSet bad config version #", version > 0);
+ uassert(13133, "replSet bad config no members", members.size() >= 1);
+ uassert(13309, "replSet bad config maximum number of members is 12", members.size() <= 12);
+ {
+ unsigned voters = 0;
+ for( vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); ++i ) {
+ if( i->votes )
+ voters++;
+ }
+ uassert(13612, "replSet bad config maximum number of voting members is 7", voters <= 7);
+ uassert(13613, "replSet bad config no voting members", voters > 0);
+ }
+ }
+
+ void ReplSetConfig::_populateTagMap(map<string,TagClause> &tagMap) {
+ // create subgroups for each server corresponding to each of
+ // its tags. E.g.:
+ //
+ // A is tagged with {"server" : "A", "dc" : "ny"}
+ // B is tagged with {"server" : "B", "dc" : "ny"}
+ //
+ // At the end of this step, tagMap will contain:
+ //
+ // "server" => {"A" : [A], "B" : [B]}
+ // "dc" => {"ny" : [A,B]}
+
+ for (unsigned i=0; i<members.size(); i++) {
+ MemberCfg member = members[i];
+
+ for (map<string,string>::iterator tag = member.tags.begin(); tag != member.tags.end(); tag++) {
+ string label = (*tag).first;
+ string value = (*tag).second;
+
+ TagClause& clause = tagMap[label];
+ clause.name = label;
+
+ TagSubgroup* subgroup;
+ // search for "ny" in "dc"'s clause
+ if (clause.subgroups.find(value) == clause.subgroups.end()) {
+ clause.subgroups[value] = subgroup = new TagSubgroup(value);
+ }
+ else {
+ subgroup = clause.subgroups[value];
+ }
+
+ subgroup->m.insert(&members[i]);
+ }
+ }
+ }
+
+ void ReplSetConfig::parseRules(const BSONObj& modes) {
+ map<string,TagClause> tagMap;
+ _populateTagMap(tagMap);
+
+ for (BSONObj::iterator i = modes.begin(); i.more(); ) {
+ unsigned int primaryOnly = 0;
+
+ // ruleName : {dc : 2, m : 3}
+ BSONElement rule = i.next();
+ uassert(14046, "getLastErrorMode rules must be objects", rule.type() == mongo::Object);
+
+ TagRule* r = new TagRule();
+
+ BSONObj clauseObj = rule.Obj();
+ for (BSONObj::iterator c = clauseObj.begin(); c.more(); ) {
+ BSONElement clauseElem = c.next();
+ uassert(14829, "getLastErrorMode criteria must be numeric", clauseElem.isNumber());
+
+ // get the clause, e.g., "x.y" : 3
+ const char *criteria = clauseElem.fieldName();
+ int value = clauseElem.numberInt();
+ uassert(14828, str::stream() << "getLastErrorMode criteria must be greater than 0: " << clauseElem, value > 0);
+
+ TagClause* node = new TagClause(tagMap[criteria]);
+
+ int numGroups = node->subgroups.size();
+ uassert(14831, str::stream() << "mode " << clauseObj << " requires "
+ << value << " tagged with " << criteria << ", but only "
+ << numGroups << " with this tag were found", numGroups >= value);
+
+ node->name = criteria;
+ node->target = value;
+ // if any subgroups contain "me", we can decrease the target
+ node->actualTarget = node->target;
+
+ // then we want to add pointers between clause & subgroup
+ for (map<string,TagSubgroup*>::iterator sgs = node->subgroups.begin();
+ sgs != node->subgroups.end(); sgs++) {
+ bool foundMe = false;
+ (*sgs).second->clauses.push_back(node);
+
+ // if this subgroup contains the primary, it's automatically always up-to-date
+ for( set<MemberCfg*>::const_iterator cfg = (*sgs).second->m.begin();
+ cfg != (*sgs).second->m.end();
+ cfg++)
+ {
+ if ((*cfg)->h.isSelf()) {
+ node->actualTarget--;
+ foundMe = true;
+ }
+ }
+
+ for (set<MemberCfg *>::iterator cfg = (*sgs).second->m.begin();
+ !foundMe && cfg != (*sgs).second->m.end(); cfg++) {
+ (*cfg)->groupsw().insert((*sgs).second);
+ }
+ }
+
+ // if all of the members of this clause involve the primary, it's always up-to-date
+ if (node->actualTarget == 0) {
+ node->last = OpTime(INT_MAX, INT_MAX);
+ primaryOnly++;
+ }
+
+ // this is a valid clause, so we want to add it to its rule
+ node->rule = r;
+ r->clauses.push_back(node);
+ }
+
+ // if all of the clauses are satisfied by the primary, this rule is trivially true
+ if (primaryOnly == r->clauses.size()) {
+ r->last = OpTime(INT_MAX, INT_MAX);
+ }
+
+ // if we got here, this is a valid rule
+ LOG(1) << "replSet new rule " << rule.fieldName() << ": " << r->toString() << rsLog;
+ rules[rule.fieldName()] = r;
+ }
+ }
+
+ void ReplSetConfig::from(BSONObj o) {
+ static const string legal[] = {"_id","version", "members","settings"};
+ static const set<string> legals(legal, legal + 4);
+ assertOnlyHas(o, legals);
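+        // a minimal config doc looks like (hypothetical example):
+        //   { _id: "rs0", version: 1, members: [ { _id: 0, host: "alice.example.com:27017" } ] }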
+
+ md5 = o.md5();
+ _id = o["_id"].String();
+ if( o["version"].ok() ) {
+ version = o["version"].numberInt();
+ uassert(13115, "bad " + rsConfigNs + " config: version", version > 0);
+ }
+
+ set<string> hosts;
+ set<int> ords;
+ vector<BSONElement> members;
+ try {
+ members = o["members"].Array();
+ }
+ catch(...) {
+ uasserted(13131, "replSet error parsing (or missing) 'members' field in config object");
+ }
+
+ unsigned localhosts = 0;
+ for( unsigned i = 0; i < members.size(); i++ ) {
+ BSONObj mobj = members[i].Obj();
+ MemberCfg m;
+ try {
+ static const string legal[] = {
+ "_id","votes","priority","host", "hidden","slaveDelay",
+ "arbiterOnly","buildIndexes","tags","initialSync" // deprecated
+ };
+ static const set<string> legals(legal, legal + 10);
+ assertOnlyHas(mobj, legals);
+
+ try {
+ m._id = (int) mobj["_id"].Number();
+ }
+ catch(...) {
+ /* TODO: use of string exceptions may be problematic for reconfig case! */
+ throw "_id must be numeric";
+ }
+ try {
+ string s = mobj["host"].String();
+ boost::trim(s);
+ m.h = HostAndPort(s);
+ if ( !m.h.hasPort() ) {
+ // make port explicit even if default
+ m.h.setPort(m.h.port());
+ }
+ }
+ catch(...) {
+ throw string("bad or missing host field? ") + mobj.toString();
+ }
+ if( m.h.isLocalHost() )
+ localhosts++;
+ m.arbiterOnly = mobj["arbiterOnly"].trueValue();
+ m.slaveDelay = mobj["slaveDelay"].numberInt();
+ if( mobj.hasElement("hidden") )
+ m.hidden = mobj["hidden"].trueValue();
+ if( mobj.hasElement("buildIndexes") )
+ m.buildIndexes = mobj["buildIndexes"].trueValue();
+ if( mobj.hasElement("priority") )
+ m.priority = mobj["priority"].Number();
+ if( mobj.hasElement("votes") )
+ m.votes = (unsigned) mobj["votes"].Number();
+ if( mobj.hasElement("tags") ) {
+ const BSONObj &t = mobj["tags"].Obj();
+ for (BSONObj::iterator c = t.begin(); c.more(); c.next()) {
+ m.tags[(*c).fieldName()] = (*c).String();
+ }
+ uassert(14827, "arbiters cannot have tags", !m.arbiterOnly || m.tags.empty() );
+ }
+ m.check();
+ }
+ catch( const char * p ) {
+ log() << "replSet cfg parsing exception for members[" << i << "] " << p << rsLog;
+ stringstream ss;
+ ss << "replSet members[" << i << "] " << p;
+ uassert(13107, ss.str(), false);
+ }
+ catch(DBException& e) {
+ log() << "replSet cfg parsing exception for members[" << i << "] " << e.what() << rsLog;
+ stringstream ss;
+ ss << "bad config for member[" << i << "] " << e.what();
+ uassert(13135, ss.str(), false);
+ }
+ if( !(ords.count(m._id) == 0 && hosts.count(m.h.toString()) == 0) ) {
+ log() << "replSet " << o.toString() << rsLog;
+ uassert(13108, "bad replset config -- duplicate hosts in the config object?", false);
+ }
+ hosts.insert(m.h.dynString());
+ ords.insert(m._id);
+ this->members.push_back(m);
+ }
+ uassert(13393, "can't use localhost in repl set member names except when using it for all members", localhosts == 0 || localhosts == members.size());
+ uassert(13117, "bad " + rsConfigNs + " config", !_id.empty());
+
+ if( o["settings"].ok() ) {
+ BSONObj settings = o["settings"].Obj();
+ if( settings["getLastErrorModes"].ok() ) {
+ parseRules(settings["getLastErrorModes"].Obj());
+ }
+ ho.check();
+ try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); }
+ catch(...) { }
+ }
+
+ // figure out the majority for this config
+ setMajority();
+ }
+
+ static inline void configAssert(bool expr) {
+ uassert(13122, "bad repl set config?", expr);
+ }
+
+ ReplSetConfig::ReplSetConfig(BSONObj cfg, bool force) {
+ _constructed = false;
+ clear();
+ from(cfg);
+ if( force ) {
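+            // bump the version by a large random amount (10000..109999),
+            // presumably so a forced config outranks any competing versions already in the set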
+ version += rand() % 100000 + 10000;
+ }
+ configAssert( version < 0 /*unspecified*/ || (version >= 1) );
+ if( version < 1 )
+ version = 1;
+ _ok = true;
+ _constructed = true;
+ }
+
+ ReplSetConfig::ReplSetConfig(const HostAndPort& h) {
+ LOG(2) << "ReplSetConfig load " << h.toStringLong() << rsLog;
+
+ _constructed = false;
+ clear();
+ int level = 2;
+ DEV level = 0;
+
+ BSONObj cfg;
+ int v = -5;
+ try {
+ if( h.isSelf() ) {
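+                // loading from ourself: skip the heartbeat sanity check below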
+ ;
+ }
+ else {
+ /* first, make sure other node is configured to be a replset. just to be safe. */
+ string setname = cmdLine.ourSetName();
+ BSONObj cmd = BSON( "replSetHeartbeat" << setname );
+ int theirVersion;
+ BSONObj info;
+ log() << "trying to contact " << h.toString() << rsLog;
+ bool ok = requestHeartbeat(setname, "", h.toString(), info, -2, theirVersion);
+ if( info["rs"].trueValue() ) {
+                    // yes, it is a replica set, although perhaps not yet initialized
+ }
+ else {
+ if( !ok ) {
+ log() << "replSet TEMP !ok heartbeating " << h.toString() << " on cfg load" << rsLog;
+ if( !info.isEmpty() )
+ log() << "replSet info " << h.toString() << " : " << info.toString() << rsLog;
+ return;
+ }
+ {
+ stringstream ss;
+ ss << "replSet error: member " << h.toString() << " is not in --replSet mode";
+                        msgassertedNoTrace(13260, ss.str().c_str()); // not a user exception, so it is not caught - we deliberately want it to propagate
+ //for python err# checker: uassert(13260, "", false);
+ }
+ }
+ }
+
+ v = -4;
+ unsigned long long count = 0;
+ try {
+ ScopedConn conn(h.toString());
+ v = -3;
+ cfg = conn.findOne(rsConfigNs, Query()).getOwned();
+ count = conn.count(rsConfigNs);
+ }
+ catch ( DBException& ) {
+ if ( !h.isSelf() ) {
+ throw;
+ }
+
+ // on startup, socket is not listening yet
+ DBDirectClient cli;
+ cfg = cli.findOne( rsConfigNs, Query() ).getOwned();
+ count = cli.count(rsConfigNs);
+ }
+
+ if( count > 1 )
+ uasserted(13109, str::stream() << "multiple rows in " << rsConfigNs << " not supported host: " << h.toString());
+
+ if( cfg.isEmpty() ) {
+ version = EMPTYCONFIG;
+ return;
+ }
+ version = -1;
+ }
+ catch( DBException& e) {
+ version = v;
+ log(level) << "replSet load config couldn't get from " << h.toString() << ' ' << e.what() << rsLog;
+ return;
+ }
+
+ from(cfg);
+ checkRsConfig();
+ _ok = true;
+ log(level) << "replSet load config ok from " << (h.isSelf() ? "self" : h.toString()) << rsLog;
+ _constructed = true;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_config.h b/src/mongo/db/repl/rs_config.h
new file mode 100644
index 00000000000..cfe2e86a568
--- /dev/null
+++ b/src/mongo/db/repl/rs_config.h
@@ -0,0 +1,251 @@
+// rs_config.h
+// repl set configuration
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../util/net/hostandport.h"
+#include "../../util/concurrency/race.h"
+#include "health.h"
+
+namespace mongo {
+ class Member;
+ const string rsConfigNs = "local.system.replset";
+
+ class ReplSetConfig {
+ enum { EMPTYCONFIG = -2 };
+ struct TagSubgroup;
+ public:
+ /**
+ * This contacts the given host and tries to get a config from them.
+ *
+ * This sends a test heartbeat to the host and, if all goes well and the
+ * host has a more recent config, fetches the config and loads it (see
+         * from()).
+ *
+ * If it's contacting itself, it skips the heartbeat (for obvious
+ * reasons.) If something is misconfigured, throws an exception. If the
+ * host couldn't be queried or is just blank, ok() will be false.
+ */
+ ReplSetConfig(const HostAndPort& h);
+
+ ReplSetConfig(BSONObj cfg, bool force=false);
+
+ bool ok() const { return _ok; }
+
+ struct TagRule;
+
+ struct MemberCfg {
+ MemberCfg() : _id(-1), votes(1), priority(1.0), arbiterOnly(false), slaveDelay(0), hidden(false), buildIndexes(true) { }
+ int _id; /* ordinal */
+ unsigned votes; /* how many votes this node gets. default 1. */
+ HostAndPort h;
+ double priority; /* 0 means can never be primary */
+ bool arbiterOnly;
+            int slaveDelay; /* seconds.  int rather than unsigned for convenient to/from bson conversion. */
+            bool hidden; /* if set, don't advertise to drivers in isMaster. for non-primaries (priority 0) */
+ bool buildIndexes; /* if false, do not create any non-_id indexes */
+ map<string,string> tags; /* tagging for data center, rack, etc. */
+ private:
+ set<TagSubgroup*> _groups; // the subgroups this member belongs to
+ public:
+ const set<TagSubgroup*>& groups() const {
+ return _groups;
+ }
+ set<TagSubgroup*>& groupsw() {
+ return _groups;
+ }
+ void check() const; /* check validity, assert if not. */
+ BSONObj asBson() const;
+ bool potentiallyHot() const { return !arbiterOnly && priority > 0; }
+ void updateGroups(const OpTime& last) {
+ RACECHECK
+ for (set<TagSubgroup*>::const_iterator it = groups().begin(); it != groups().end(); it++) {
+ ((TagSubgroup*)(*it))->updateLast(last);
+ }
+ }
+ bool operator==(const MemberCfg& r) const {
+ if (!tags.empty() || !r.tags.empty()) {
+ if (tags.size() != r.tags.size()) {
+ return false;
+ }
+
+ // if they are the same size and not equal, at least one
+ // element in A must be different in B
+ for (map<string,string>::const_iterator lit = tags.begin(); lit != tags.end(); lit++) {
+ map<string,string>::const_iterator rit = r.tags.find((*lit).first);
+
+ if (rit == r.tags.end() || (*lit).second != (*rit).second) {
+ return false;
+ }
+ }
+ }
+
+ return _id==r._id && votes == r.votes && h == r.h && priority == r.priority &&
+ arbiterOnly == r.arbiterOnly && slaveDelay == r.slaveDelay && hidden == r.hidden &&
+                   buildIndexes == r.buildIndexes;
+ }
+ bool operator!=(const MemberCfg& r) const { return !(*this == r); }
+ };
+
+ vector<MemberCfg> members;
+ string _id;
+ int version;
+ HealthOptions ho;
+ string md5;
+ BSONObj getLastErrorDefaults;
+ map<string,TagRule*> rules;
+
+ list<HostAndPort> otherMemberHostnames() const; // except self
+
+        /** @return true if we could connect, and there is no cfg object there at all */
+ bool empty() const { return version == EMPTYCONFIG; }
+
+ string toString() const { return asBson().toString(); }
+
+ /** validate the settings. does not call check() on each member, you have to do that separately. */
+ void checkRsConfig() const;
+
+ /** check if modification makes sense */
+ static bool legalChange(const ReplSetConfig& old, const ReplSetConfig& n, string& errmsg);
+
+ //static void receivedNewConfig(BSONObj);
+ void saveConfigLocally(BSONObj comment); // to local db
+ string saveConfigEverywhere(); // returns textual info on what happened
+
+ /**
+ * Update members' groups when the config changes but members stay the same.
+ */
+ void updateMembers(List1<Member> &dest);
+
+ BSONObj asBson() const;
+
+ /**
+ * Getter and setter for _majority. This is almost always
+ * members.size()/2+1, but can be the number of non-arbiter members if
+ * there are more arbiters than non-arbiters (writing to 3 out of 7
+ * servers is safe if 4 of the servers are arbiters).
+ */
+ void setMajority();
+ int getMajority() const;
+
+ bool _constructed;
+ private:
+ bool _ok;
+ int _majority;
+
+ void from(BSONObj);
+ void clear();
+
+ struct TagClause;
+
+ /**
+ * This is a logical grouping of servers. It is pointed to by a set of
+ * servers with a certain tag.
+ *
+ * For example, suppose servers A, B, and C have the tag "dc" : "nyc". If we
+ * have a rule {"dc" : 2}, then we want A _or_ B _or_ C to have the
+ * write for one of the "dc" critiria to be fulfilled, so all three will
+ * point to this subgroup. When one of their oplog-tailing cursors is
+ * updated, this subgroup is updated.
+ */
+ struct TagSubgroup : boost::noncopyable {
+ ~TagSubgroup(); // never called; not defined
+ TagSubgroup(string nm) : name(nm) { }
+ const string name;
+ OpTime last;
+ vector<TagClause*> clauses;
+
+ // this probably won't actually point to valid members after the
+ // subgroup is created, as initFromConfig() makes a copy of the
+ // config
+ set<MemberCfg*> m;
+
+ void updateLast(const OpTime& op);
+
+ //string toString() const;
+
+ /**
+ * If two tags have the same name, they should compare as equal so
+ * that members don't have to update two identical groups on writes.
+ */
+ bool operator() (TagSubgroup& lhs, TagSubgroup& rhs) const {
+ return lhs.name < rhs.name;
+ }
+ };
+
+ /**
+ * An argument in a rule. For example, if we had the rule {dc : 2,
+ * machines : 3}, "dc" : 2 and "machines" : 3 would be two TagClauses.
+ *
+ * Each tag clause has a set of associated subgroups. For example, if
+ * we had "dc" : 2, our subgroups might be "nyc", "sf", and "hk".
+ */
+ struct TagClause {
+ OpTime last;
+ map<string,TagSubgroup*> subgroups;
+ TagRule *rule;
+ string name;
+ /**
+             * If we get a clause like {machines : 3} and this server is
+ * tagged with "machines", then it's really {machines : 2}, as we
+ * will always be up-to-date. So, target would be 3 and
+ * actualTarget would be 2, in that example.
+ */
+ int target;
+ int actualTarget;
+
+ void updateLast(const OpTime& op);
+ string toString() const;
+ };
+
+ /**
+ * Parses getLastErrorModes.
+ */
+ void parseRules(const BSONObj& modes);
+
+ /**
+ * Create a hash containing every possible clause that could be used in a
+ * rule and the servers related to that clause.
+ *
+ * For example, suppose we have the following servers:
+ * A {"dc" : "ny", "ny" : "rk1"}
+ * B {"dc" : "ny", "ny" : "rk1"}
+ * C {"dc" : "ny", "ny" : "rk2"}
+ * D {"dc" : "sf", "sf" : "rk1"}
+ * E {"dc" : "sf", "sf" : "rk2"}
+ *
+ * This would give us the possible criteria:
+ * "dc" -> {A, B, C},{D, E}
+ * "ny" -> {A, B},{C}
+ * "sf" -> {D},{E}
+ */
+ void _populateTagMap(map<string,TagClause> &tagMap);
+
+ public:
+ struct TagRule {
+ vector<TagClause*> clauses;
+ OpTime last;
+
+ void updateLast(const OpTime& op);
+ string toString() const;
+ };
+ };
+
+}
diff --git a/src/mongo/db/repl/rs_exception.h b/src/mongo/db/repl/rs_exception.h
new file mode 100644
index 00000000000..fc372fc241c
--- /dev/null
+++ b/src/mongo/db/repl/rs_exception.h
@@ -0,0 +1,17 @@
+// @file rs_exception.h
+
+#pragma once
+
+namespace mongo {
+
+ class VoteException : public std::exception {
+ public:
+ const char * what() const throw () { return "VoteException"; }
+ };
+
+ class RetryAfterSleepException : public std::exception {
+ public:
+ const char * what() const throw () { return "RetryAfterSleepException"; }
+ };
+
+}
diff --git a/src/mongo/db/repl/rs_initialsync.cpp b/src/mongo/db/repl/rs_initialsync.cpp
new file mode 100644
index 00000000000..b67c0d71b83
--- /dev/null
+++ b/src/mongo/db/repl/rs_initialsync.cpp
@@ -0,0 +1,271 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../repl.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "rs.h"
+#include "../oplogreader.h"
+#include "../../util/mongoutils/str.h"
+#include "../dbhelpers.h"
+#include "rs_optime.h"
+#include "../oplog.h"
+
+namespace mongo {
+
+ using namespace mongoutils;
+ using namespace bson;
+
+ void dropAllDatabasesExceptLocal();
+
+ // add try/catch with sleep
+
+ void isyncassert(const string& msg, bool expr) {
+ if( !expr ) {
+ string m = str::stream() << "initial sync " << msg;
+ theReplSet->sethbmsg(m, 0);
+ uasserted(13404, m);
+ }
+ }
+
+ void ReplSetImpl::syncDoInitialSync() {
+ createOplog();
+
+ while( 1 ) {
+ try {
+ _syncDoInitialSync();
+ break;
+ }
+ catch(DBException& e) {
+ sethbmsg("initial sync exception " + e.toString(), 0);
+ sleepsecs(30);
+ }
+ }
+ }
+
+ /* todo : progress metering to sethbmsg. */
+ static bool clone(const char *master, string db) {
+ string err;
+ return cloneFrom(master, err, db, false,
+ /* slave_ok */ true, true, false, /*mayYield*/true, /*mayBeInterrupted*/false);
+ }
+
+ void _logOpObjRS(const BSONObj& op);
+
+ static void emptyOplog() {
+ writelock lk(rsoplog);
+ Client::Context ctx(rsoplog);
+ NamespaceDetails *d = nsdetails(rsoplog);
+
+ // temp
+        if( d == 0 || d->stats.nrecords == 0 )
+            return; // nonexistent or already empty, ok.
+
+ LOG(1) << "replSet empty oplog" << rsLog;
+ d->emptyCappedCollection(rsoplog);
+ }
+
+ Member* ReplSetImpl::getMemberToSyncTo() {
+ Member *closest = 0;
+ time_t now = 0;
+ bool buildIndexes = true;
+
+ // wait for 2N pings before choosing a sync target
+ if (_cfg) {
+ int needMorePings = config().members.size()*2 - HeartbeatInfo::numPings;
+
+ if (needMorePings > 0) {
+ OCCASIONALLY log() << "waiting for " << needMorePings << " pings from other members before syncing" << endl;
+ return NULL;
+ }
+
+ buildIndexes = myConfig().buildIndexes;
+ }
+
+ // find the member with the lowest ping time that has more data than me
+ for (Member *m = _members.head(); m; m = m->next()) {
+ if (m->hbinfo().up() &&
+ // make sure members with buildIndexes sync from other members w/indexes
+                (!buildIndexes || m->config().buildIndexes) &&
+ (m->state() == MemberState::RS_PRIMARY ||
+ (m->state() == MemberState::RS_SECONDARY && m->hbinfo().opTime > lastOpTimeWritten)) &&
+ (!closest || m->hbinfo().ping < closest->hbinfo().ping)) {
+
+ map<string,time_t>::iterator vetoed = _veto.find(m->fullName());
+ if (vetoed == _veto.end()) {
+ closest = m;
+ break;
+ }
+
+ if (now == 0) {
+ now = time(0);
+ }
+
+                // if this member was on the veto list, check whether its veto has already expired
+ if ((*vetoed).second < now) {
+ _veto.erase(vetoed);
+ closest = m;
+ break;
+ }
+
+ // if it was recently vetoed, skip
+ log() << "replSet not trying to sync from " << (*vetoed).first
+ << ", it is vetoed for " << ((*vetoed).second - now) << " more seconds" << rsLog;
+ }
+ }
+
+ {
+ lock lk(this);
+
+ if (!closest) {
+ _currentSyncTarget = NULL;
+ return NULL;
+ }
+
+ _currentSyncTarget = closest;
+ }
+
+ sethbmsg( str::stream() << "syncing to: " << closest->fullName(), 0);
+
+ return closest;
+ }
+
+ void ReplSetImpl::veto(const string& host, const unsigned secs) {
+ _veto[host] = time(0)+secs;
+ }
+
+ /**
+ * Do the initial sync for this member.
+ */
+ void ReplSetImpl::_syncDoInitialSync() {
+ sethbmsg("initial sync pending",0);
+
+ // if this is the first node, it may have already become primary
+ if ( box.getState().primary() ) {
+ sethbmsg("I'm already primary, no need for initial sync",0);
+ return;
+ }
+
+ const Member *source = getMemberToSyncTo();
+ if (!source) {
+ sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
+ sleepsecs(15);
+ return;
+ }
+
+ string sourceHostname = source->h().toString();
+ OplogReader r;
+ if( !r.connect(sourceHostname) ) {
+ sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
+ sleepsecs(15);
+ return;
+ }
+
+ BSONObj lastOp = r.getLastOp(rsoplog);
+ if( lastOp.isEmpty() ) {
+ sethbmsg("initial sync couldn't read remote oplog", 0);
+ sleepsecs(15);
+ return;
+ }
+ OpTime startingTS = lastOp["ts"]._opTime();
+
+ if (replSettings.fastsync) {
+ log() << "fastsync: skipping database clone" << rsLog;
+ }
+ else {
+ sethbmsg("initial sync drop all databases", 0);
+ dropAllDatabasesExceptLocal();
+
+ sethbmsg("initial sync clone all databases", 0);
+
+ list<string> dbs = r.conn()->getDatabaseNames();
+ for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) {
+ string db = *i;
+ if( db != "local" ) {
+ sethbmsg( str::stream() << "initial sync cloning db: " << db , 0);
+ bool ok;
+ {
+ writelock lk(db);
+ Client::Context ctx(db);
+ ok = clone(sourceHostname.c_str(), db);
+ }
+ if( !ok ) {
+ sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0);
+ veto(source->fullName(), 600);
+ sleepsecs(300);
+ return;
+ }
+ }
+ }
+ }
+
+ sethbmsg("initial sync query minValid",0);
+
+        /* our cloned copy will be inconsistent until we apply the oplog events that occurred
+           during the cloning process. we note that time point here. */
+ BSONObj minValid = r.getLastOp(rsoplog);
+ isyncassert( "getLastOp is empty ", !minValid.isEmpty() );
+ OpTime mvoptime = minValid["ts"]._opTime();
+ assert( !mvoptime.isNull() );
+ assert( mvoptime >= startingTS );
+
+ // apply startingTS..mvoptime portion of the oplog
+ {
+ // note we assume here that this call does not throw
+ if( ! initialSyncOplogApplication(startingTS, mvoptime) ) {
+ log() << "replSet initial sync failed during oplog application phase" << rsLog;
+
+ emptyOplog(); // otherwise we'll be up!
+
+ lastOpTimeWritten = OpTime();
+ lastH = 0;
+
+ log() << "replSet cleaning up [1]" << rsLog;
+ {
+ writelock lk("local.");
+ Client::Context cx( "local." );
+ cx.db()->flushFiles(true);
+ }
+ log() << "replSet cleaning up [2]" << rsLog;
+
+ log() << "replSet initial sync failed will try again" << endl;
+
+ sleepsecs(5);
+ return;
+ }
+ }
+
+ sethbmsg("initial sync finishing up",0);
+
+ assert( !box.getState().primary() ); // wouldn't make sense if we were.
+
+ {
+ writelock lk("local.");
+ Client::Context cx( "local." );
+ cx.db()->flushFiles(true);
+ try {
+ log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
+ }
+ catch(...) { }
+ Helpers::putSingleton("local.replset.minvalid", minValid);
+ cx.db()->flushFiles(true);
+ }
+
+ sethbmsg("initial sync done",0);
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_initiate.cpp b/src/mongo/db/repl/rs_initiate.cpp
new file mode 100644
index 00000000000..77bc6c03938
--- /dev/null
+++ b/src/mongo/db/repl/rs_initiate.cpp
@@ -0,0 +1,269 @@
+/* @file rs_initiate.cpp
+ */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../commands.h"
+#include "../../util/mmap.h"
+#include "../../util/mongoutils/str.h"
+#include "health.h"
+#include "rs.h"
+#include "rs_config.h"
+#include "../dbhelpers.h"
+#include "../oplog.h"
+
+using namespace bson;
+using namespace mongoutils;
+
+namespace mongo {
+
+ /* called on a reconfig AND on initiate
+ throws
+ @param initial true when initiating
+ */
+ void checkMembersUpForConfigChange(const ReplSetConfig& cfg, BSONObjBuilder& result, bool initial) {
+ int failures = 0, allVotes = 0, allowableFailures = 0;
+ int me = 0;
+ stringstream selfs;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
+ if( i->h.isSelf() ) {
+ me++;
+ if( me > 1 )
+ selfs << ',';
+ selfs << i->h.toString();
+ if( !i->potentiallyHot() ) {
+ uasserted(13420, "initiation and reconfiguration of a replica set must be sent to a node that can become primary");
+ }
+ }
+ allVotes += i->votes;
+ }
+ allowableFailures = allVotes - (allVotes/2 + 1);
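+        // e.g. allVotes = 5 -> a majority is 3, so allowableFailures = 2;
+        //      allVotes = 4 -> a majority is 3, so allowableFailures = 1.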
+
+ uassert(13278, "bad config: isSelf is true for multiple hosts: " + selfs.str(), me <= 1); // dups?
+ if( me != 1 ) {
+ stringstream ss;
+ ss << "can't find self in the replset config";
+ if( !cmdLine.isDefaultPort() ) ss << " my port: " << cmdLine.port;
+ if( me != 0 ) ss << " found: " << me;
+ uasserted(13279, ss.str());
+ }
+
+ vector<string> down;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
+ // we know we're up
+ if (i->h.isSelf()) {
+ continue;
+ }
+
+ BSONObj res;
+ {
+ bool ok = false;
+ try {
+ int theirVersion = -1000;
+ ok = requestHeartbeat(cfg._id, "", i->h.toString(), res, -1, theirVersion, initial/*check if empty*/);
+ if( theirVersion >= cfg.version ) {
+ stringstream ss;
+ ss << "replSet member " << i->h.toString() << " has too new a config version (" << theirVersion << ") to reconfigure";
+ uasserted(13259, ss.str());
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet cmufcc requestHeartbeat " << i->h.toString() << " : " << e.toString() << rsLog;
+ }
+ catch(...) {
+ log() << "replSet cmufcc error exception in requestHeartbeat?" << rsLog;
+ }
+ if( res.getBoolField("mismatch") )
+ uasserted(13145, "set name does not match the set name host " + i->h.toString() + " expects");
+ if( *res.getStringField("set") ) {
+ if( cfg.version <= 1 ) {
+                    // this was to be an initiation; no one should be initiated already.
+ uasserted(13256, "member " + i->h.toString() + " is already initiated");
+ }
+ else {
+ // Assure no one has a newer config.
+ if( res["v"].Int() >= cfg.version ) {
+                        uasserted(13341, "member " + i->h.toString() + " has a config version >= the new cfg version; cannot change config");
+ }
+ }
+ }
+ if( !ok && !res["rs"].trueValue() ) {
+ down.push_back(i->h.toString());
+
+ if( !res.isEmpty() ) {
+ /* strange. got a response, but not "ok". log it. */
+ log() << "replSet warning " << i->h.toString() << " replied: " << res.toString() << rsLog;
+ }
+
+ bool allowFailure = false;
+ failures += i->votes;
+ if( !initial && failures <= allowableFailures ) {
+ const Member* m = theReplSet->findById( i->_id );
+ if( m ) {
+ assert( m->h().toString() == i->h.toString() );
+ }
+ // it's okay if the down member isn't part of the config,
+ // we might be adding a new member that isn't up yet
+ allowFailure = true;
+ }
+
+ if( !allowFailure ) {
+ string msg = string("need all members up to initiate, not ok : ") + i->h.toStringLong();
+ if( !initial )
+ msg = string("need most members up to reconfigure, not ok : ") + i->h.toString();
+ uasserted(13144, msg);
+ }
+ }
+ }
+ if( initial ) {
+ bool hasData = res["hasData"].Bool();
+ uassert(13311, "member " + i->h.toString() + " has data already, cannot initiate set. All members except initiator must be empty.",
+ !hasData || i->h.isSelf());
+ }
+ }
+ if (down.size() > 0) {
+ result.append("down", down);
+ }
+ }
+
+ class CmdReplSetInitiate : public ReplSetCommand {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ CmdReplSetInitiate() : ReplSetCommand("replSetInitiate") { }
+ virtual void help(stringstream& h) const {
+ h << "Initiate/christen a replica set.";
+ h << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << "replSet replSetInitiate admin command received from client" << rsLog;
+
+ if( !replSet ) {
+ errmsg = "server is not running with --replSet";
+ return false;
+ }
+ if( theReplSet ) {
+ errmsg = "already initialized";
+ result.append("info", "try querying " + rsConfigNs + " to see current configuration");
+ return false;
+ }
+
+ {
+ // just make sure we can get a write lock before doing anything else. we'll reacquire one
+ // later. of course it could be stuck then, but this check lowers the risk if weird things
+ // are up.
+ time_t t = time(0);
+ writelock lk("");
+ if( time(0)-t > 10 ) {
+                    errmsg = "took a long time to get write lock, so not initiating. Initiate when the server is less busy?";
+ return false;
+ }
+
+ /* check that we don't already have an oplog. that could cause issues.
+ it is ok if the initiating member has *other* data than that.
+ */
+ BSONObj o;
+ if( Helpers::getFirst(rsoplog, o) ) {
+ errmsg = rsoplog + string(" is not empty on the initiating member. cannot initiate.");
+ return false;
+ }
+ }
+
+ if( ReplSet::startupStatus == ReplSet::BADCONFIG ) {
+ errmsg = "server already in BADCONFIG state (check logs); not initiating";
+ result.append("info", ReplSet::startupStatusMsg.get());
+ return false;
+ }
+ if( ReplSet::startupStatus != ReplSet::EMPTYCONFIG ) {
+ result.append("startupStatus", ReplSet::startupStatus);
+ errmsg = "all members and seeds must be reachable to initiate set";
+ result.append("info", cmdLine._replSet);
+ return false;
+ }
+
+ BSONObj configObj;
+
+ if( cmdObj["replSetInitiate"].type() != Object ) {
+ result.append("info2", "no configuration explicitly specified -- making one");
+ log() << "replSet info initiate : no configuration specified. Using a default configuration for the set" << rsLog;
+
+ string name;
+ vector<HostAndPort> seeds;
+ set<HostAndPort> seedSet;
+ parseReplsetCmdLine(cmdLine._replSet, name, seeds, seedSet); // may throw...
+
+ bob b;
+ b.append("_id", name);
+ bob members;
+ members.append("0", BSON( "_id" << 0 << "host" << HostAndPort::Me().dynString() ));
+ result.append("me", HostAndPort::Me().toString());
+ for( unsigned i = 0; i < seeds.size(); i++ )
+ members.append(bob::numStr(i+1), BSON( "_id" << i+1 << "host" << seeds[i].toString()));
+ b.appendArray("members", members.obj());
+ configObj = b.obj();
+ log() << "replSet created this configuration for initiation : " << configObj.toString() << rsLog;
+ }
+ else {
+ configObj = cmdObj["replSetInitiate"].Obj();
+ }
+
+ bool parsed = false;
+ try {
+ ReplSetConfig newConfig(configObj);
+ parsed = true;
+
+ if( newConfig.version > 1 ) {
+ errmsg = "can't initiate with a version number greater than 1";
+ return false;
+ }
+
+ log() << "replSet replSetInitiate config object parses ok, " << newConfig.members.size() << " members specified" << rsLog;
+
+ checkMembersUpForConfigChange(newConfig, result, true);
+
+ log() << "replSet replSetInitiate all members seem up" << rsLog;
+
+ createOplog();
+
+ writelock lk("");
+ bo comment = BSON( "msg" << "initiating set");
+ newConfig.saveConfigLocally(comment);
+ log() << "replSet replSetInitiate config now saved locally. Should come online in about a minute." << rsLog;
+ result.append("info", "Config now saved locally. Should come online in about a minute.");
+ ReplSet::startupStatus = ReplSet::SOON;
+ ReplSet::startupStatusMsg.set("Received replSetInitiate - should come online shortly.");
+ }
+ catch( DBException& e ) {
+ log() << "replSet replSetInitiate exception: " << e.what() << rsLog;
+ if( !parsed )
+ errmsg = string("couldn't parse cfg object ") + e.what();
+ else
+ errmsg = string("couldn't initiate : ") + e.what();
+ return false;
+ }
+ catch( string& e2 ) {
+ log() << e2 << rsLog;
+ errmsg = e2;
+ return false;
+ }
+
+ return true;
+ }
+ } cmdReplSetInitiate;
+
+}
diff --git a/src/mongo/db/repl/rs_member.h b/src/mongo/db/repl/rs_member.h
new file mode 100644
index 00000000000..24e593392b6
--- /dev/null
+++ b/src/mongo/db/repl/rs_member.h
@@ -0,0 +1,131 @@
+// @file rs_member.h
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** replica set member */
+
+#pragma once
+
+#include "../../util/concurrency/value.h"
+
+namespace mongo {
+
+
+ /*
+       RS_STARTUP    server still starting up, or still trying to initiate the set
+ RS_PRIMARY this server thinks it is primary
+ RS_SECONDARY this server thinks it is a secondary (slave mode)
+ RS_RECOVERING recovering/resyncing; after recovery usually auto-transitions to secondary
+       RS_FATAL      something bad has occurred and the server is now effectively offline with regard to the replica set. fatal error.
+ RS_STARTUP2 loaded config, still determining who is primary
+ */
+ struct MemberState {
+ enum MS {
+ RS_STARTUP = 0,
+ RS_PRIMARY = 1,
+ RS_SECONDARY = 2,
+ RS_RECOVERING = 3,
+ RS_FATAL = 4,
+ RS_STARTUP2 = 5,
+ RS_UNKNOWN = 6, /* remote node not yet reached */
+ RS_ARBITER = 7,
+ RS_DOWN = 8, /* node not reachable for a report */
+ RS_ROLLBACK = 9
+ } s;
+
+ MemberState(MS ms = RS_UNKNOWN) : s(ms) { }
+ explicit MemberState(int ms) : s((MS) ms) { }
+
+ bool startup() const { return s == RS_STARTUP; }
+ bool primary() const { return s == RS_PRIMARY; }
+ bool secondary() const { return s == RS_SECONDARY; }
+ bool recovering() const { return s == RS_RECOVERING; }
+ bool startup2() const { return s == RS_STARTUP2; }
+ bool fatal() const { return s == RS_FATAL; }
+ bool rollback() const { return s == RS_ROLLBACK; }
+ bool readable() const { return s == RS_PRIMARY || s == RS_SECONDARY; }
+
+ string toString() const;
+
+ bool operator==(const MemberState& r) const { return s == r.s; }
+ bool operator!=(const MemberState& r) const { return s != r.s; }
+ };
+
+ /* this is supposed to be just basic information on a member,
+ and copy constructable. */
+ class HeartbeatInfo {
+ unsigned _id;
+ public:
+ HeartbeatInfo() : _id(0xffffffff), hbstate(MemberState::RS_UNKNOWN), health(-1.0),
+ downSince(0), skew(INT_MIN), authIssue(false), ping(0) { }
+ HeartbeatInfo(unsigned id);
+ unsigned id() const { return _id; }
+ MemberState hbstate;
+ double health;
+ time_t upSince;
+ long long downSince;
+ time_t lastHeartbeat;
+ DiagStr lastHeartbeatMsg;
+ OpTime opTime;
+ int skew;
+ bool authIssue;
+ unsigned int ping; // milliseconds
+ static unsigned int numPings;
+
+ bool up() const { return health > 0; }
+
+ /** health is set to -1 on startup. that means we haven't even checked yet. 0 means we checked and it failed. */
+ bool maybeUp() const { return health != 0; }
+
+ long long timeDown() const; // ms
+
+ /* true if changed in a way of interest to the repl set manager. */
+ bool changed(const HeartbeatInfo& old) const;
+ };
+
+ inline HeartbeatInfo::HeartbeatInfo(unsigned id) :
+ _id(id),
+ authIssue(false),
+ ping(0) {
+ hbstate = MemberState::RS_UNKNOWN;
+ health = -1.0;
+ downSince = 0;
+ lastHeartbeat = upSince = 0;
+ skew = INT_MIN;
+ }
+
+ inline bool HeartbeatInfo::changed(const HeartbeatInfo& old) const {
+ return health != old.health ||
+ hbstate != old.hbstate;
+ }
+
+ inline string MemberState::toString() const {
+ switch ( s ) {
+ case RS_STARTUP: return "STARTUP";
+ case RS_PRIMARY: return "PRIMARY";
+ case RS_SECONDARY: return "SECONDARY";
+ case RS_RECOVERING: return "RECOVERING";
+ case RS_FATAL: return "FATAL";
+ case RS_STARTUP2: return "STARTUP2";
+ case RS_ARBITER: return "ARBITER";
+ case RS_DOWN: return "DOWN";
+ case RS_ROLLBACK: return "ROLLBACK";
+ case RS_UNKNOWN: return "UNKNOWN";
+ }
+ return "";
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_optime.h b/src/mongo/db/repl/rs_optime.h
new file mode 100644
index 00000000000..f0ca56927ad
--- /dev/null
+++ b/src/mongo/db/repl/rs_optime.h
@@ -0,0 +1,58 @@
+// @file rs_optime.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../util/optime.h"
+
+namespace mongo {
+
+ const char rsoplog[] = "local.oplog.rs";
+
+ /*
+ class RSOpTime : public OpTime {
+ public:
+ bool initiated() const { return getSecs() != 0; }
+ };*/
+
+ /*struct RSOpTime {
+ unsigned long long ord;
+
+ RSOpTime() : ord(0) { }
+
+ bool initiated() const { return ord > 0; }
+
+ void initiate() {
+ assert( !initiated() );
+ ord = 1000000;
+ }
+
+ ReplTime inc() {
+ DEV assertInWriteLock();
+ return ++ord;
+ }
+
+ string toString() const { return str::stream() << ord; }
+
+ // query the oplog and set the highest value herein. acquires a db read lock. throws.
+ void load();
+ };
+
+ extern RSOpTime rsOpTime;*/
+
+}
diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp
new file mode 100644
index 00000000000..10727c59669
--- /dev/null
+++ b/src/mongo/db/repl/rs_rollback.cpp
@@ -0,0 +1,667 @@
+/* @file rs_rollback.cpp
+*
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "rs.h"
+#include "../repl.h"
+#include "../ops/query.h"
+#include "../cloner.h"
+#include "../ops/update.h"
+#include "../ops/delete.h"
+
+/* Scenarios
+
+ We went offline with ops not replicated out.
+
+ F = node that failed and coming back.
+ P = node that took over, new primary
+
+ #1:
+ F : a b c d e f g
+ P : a b c d q
+
+ The design is "keep P". One could argue here that "keep F" has some merits, however, in most cases P
+ will have significantly more data. Also note that P may have a proper subset of F's stream if there were
+ no subsequent writes.
+
+ For now the model is simply : get F back in sync with P. If P was really behind or something, we should have
+ just chosen not to fail over anyway.
+
+ #2:
+ F : a b c d e f g -> a b c d
+ P : a b c d
+
+ #3:
+ F : a b c d e f g -> a b c d q r s t u v w x z
+ P : a b c d q r s t u v w x z
+
+ Steps
+ find an event in common. 'd'.
+ undo our events beyond that by:
+ (1) taking copy from other server of those objects
+ (2) do not consider the copy valid until we reach an optime after the point at which we fetched the new version of the object
+ -- i.e., reset minvalid.
+ (3) we could skip operations on objects that are previous in time to our capture of the object as an optimization.
+
+*/
+
+namespace mongo {
+
+ using namespace bson;
+
+ void incRBID();
+
+ class rsfatal : public std::exception {
+ public:
+ virtual const char* what() const throw() { return "replica set fatal exception"; }
+ };
+
+ struct DocID {
+ const char *ns;
+ be _id;
+ bool operator<(const DocID& d) const {
+ int c = strcmp(ns, d.ns);
+ if( c < 0 ) return true;
+ if( c > 0 ) return false;
+ return _id < d._id;
+ }
+ };
+
+ struct HowToFixUp {
+ /* note this is a set -- if there are many $inc's on a single document we need to rollback, we only
+ need to refetch it once. */
+ set<DocID> toRefetch;
+
+ /* collections to drop */
+ set<string> toDrop;
+
+ set<string> collectionsToResync;
+
+ OpTime commonPoint;
+ DiskLoc commonPointOurDiskloc;
+
+ int rbid; // remote server's current rollback sequence #
+ };
+
+ static void refetch(HowToFixUp& h, const BSONObj& ourObj) {
+ const char *op = ourObj.getStringField("op");
+ if( *op == 'n' )
+ return;
+
+ unsigned long long totSize = 0;
+ totSize += ourObj.objsize();
+ if( totSize > 512 * 1024 * 1024 )
+ throw "rollback too large";
+
+ DocID d;
+ // NOTE The assigned ns value may become invalid if we yield.
+ d.ns = ourObj.getStringField("ns");
+ if( *d.ns == 0 ) {
+ log() << "replSet WARNING ignoring op on rollback no ns TODO : " << ourObj.toString() << rsLog;
+ return;
+ }
+
+ bo o = ourObj.getObjectField(*op=='u' ? "o2" : "o");
+ if( o.isEmpty() ) {
+ log() << "replSet warning ignoring op on rollback : " << ourObj.toString() << rsLog;
+ return;
+ }
+
+ if( *op == 'c' ) {
+ be first = o.firstElement();
+ NamespaceString s(d.ns); // foo.$cmd
+ string cmdname = first.fieldName();
+ Command *cmd = Command::findCommand(cmdname.c_str());
+ if( cmd == 0 ) {
+                log() << "replSet warning rollback no such command " << first.fieldName() << " - different mongod versions perhaps?" << rsLog;
+ return;
+ }
+ else {
+                /* findandmodify - translated?
+ godinsert?,
+ renamecollection a->b. just resync a & b
+ */
+ if( cmdname == "create" ) {
+ /* Create collection operation
+ { ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } }
+ */
+ string ns = s.db + '.' + o["create"].String(); // -> foo.abc
+ h.toDrop.insert(ns);
+ return;
+ }
+ else if( cmdname == "drop" ) {
+ string ns = s.db + '.' + first.valuestr();
+ h.collectionsToResync.insert(ns);
+ return;
+ }
+ else if( cmdname == "dropIndexes" || cmdname == "deleteIndexes" ) {
+                    /* TODO: this is bad. we simply do a full resync of the collection here, which could be very slow. */
+ log() << "replSet info rollback of dropIndexes is slow in this version of mongod" << rsLog;
+ string ns = s.db + '.' + first.valuestr();
+ h.collectionsToResync.insert(ns);
+ return;
+ }
+ else if( cmdname == "renameCollection" ) {
+ /* TODO: slow. */
+ log() << "replSet info rollback of renameCollection is slow in this version of mongod" << rsLog;
+ string from = first.valuestr();
+ string to = o["to"].String();
+ h.collectionsToResync.insert(from);
+ h.collectionsToResync.insert(to);
+ return;
+ }
+ else if( cmdname == "reIndex" ) {
+ return;
+ }
+ else if( cmdname == "dropDatabase" ) {
+                    log() << "replSet error rollback : can't rollback drop database; full resync will be required" << rsLog;
+ log() << "replSet " << o.toString() << rsLog;
+ throw rsfatal();
+ }
+ else {
+ log() << "replSet error can't rollback this command yet: " << o.toString() << rsLog;
+ log() << "replSet cmdname=" << cmdname << rsLog;
+ throw rsfatal();
+ }
+ }
+ }
+
+ d._id = o["_id"];
+ if( d._id.eoo() ) {
+ log() << "replSet WARNING ignoring op on rollback no _id TODO : " << d.ns << ' '<< ourObj.toString() << rsLog;
+ return;
+ }
+
+ h.toRefetch.insert(d);
+ }
+
+ int getRBID(DBClientConnection*);
+
+ static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) {
+ static time_t last;
+ if( time(0)-last < 60 ) {
+ throw "findcommonpoint waiting a while before trying again";
+ }
+ last = time(0);
+
+ assert( d.dbMutex.atLeastReadLocked() );
+ Client::Context c(rsoplog);
+ NamespaceDetails *nsd = nsdetails(rsoplog);
+ assert(nsd);
+ ReverseCappedCursor u(nsd);
+ if( !u.ok() )
+ throw "our oplog empty or unreadable";
+
+ const Query q = Query().sort(reverseNaturalObj);
+ const bo fields = BSON( "ts" << 1 << "h" << 1 );
+
+ //auto_ptr<DBClientCursor> u = us->query(rsoplog, q, 0, 0, &fields, 0, 0);
+
+ h.rbid = getRBID(them);
+ auto_ptr<DBClientCursor> t = them->query(rsoplog, q, 0, 0, &fields, 0, 0);
+
+ if( t.get() == 0 || !t->more() ) throw "remote oplog empty or unreadable";
+
+ BSONObj ourObj = u.current();
+ OpTime ourTime = ourObj["ts"]._opTime();
+ BSONObj theirObj = t->nextSafe();
+ OpTime theirTime = theirObj["ts"]._opTime();
+
+ {
+ long long diff = (long long) ourTime.getSecs() - ((long long) theirTime.getSecs());
+ /* diff could be positive, negative, or zero */
+ log() << "replSet info rollback our last optime: " << ourTime.toStringPretty() << rsLog;
+ log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog;
+ log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog;
+ if( diff > 1800 ) {
+ log() << "replSet rollback too long a time period for a rollback." << rsLog;
+ throw "error not willing to roll back more than 30 minutes of data";
+ }
+ }
+
+ unsigned long long scanned = 0;
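+        // the loop below walks both oplogs newest-to-oldest like a merge:
+        // whichever side has the later ts advances (ours while queueing refetches);
+        // on a ts tie the hashes decide - a match is the common point, otherwise
+        // both sides advance. e.g. if ours ends ..d e f and theirs ends ..d q,
+        // we walk back past f and e (refetching their versions of the touched
+        // docs) and past q, until both cursors sit on d.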
+ while( 1 ) {
+ scanned++;
+ /* todo add code to assure no excessive scanning for too long */
+ if( ourTime == theirTime ) {
+ if( ourObj["h"].Long() == theirObj["h"].Long() ) {
+ // found the point back in time where we match.
+ // todo : check a few more just to be careful about hash collisions.
+ log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog;
+ log() << "replSet rollback findcommonpoint scanned : " << scanned << rsLog;
+ h.commonPoint = ourTime;
+ h.commonPointOurDiskloc = u.currLoc();
+ return;
+ }
+
+ refetch(h, ourObj);
+
+ if( !t->more() ) {
+ log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS100 reached beginning of remote oplog [2]";
+ }
+ theirObj = t->nextSafe();
+ theirTime = theirObj["ts"]._opTime();
+
+ u.advance();
+ if( !u.ok() ) {
+ log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS101 reached beginning of local oplog [1]";
+ }
+ ourObj = u.current();
+ ourTime = ourObj["ts"]._opTime();
+ }
+ else if( theirTime > ourTime ) {
+ if( !t->more() ) {
+ log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS100 reached beginning of remote oplog [1]";
+ }
+ theirObj = t->nextSafe();
+ theirTime = theirObj["ts"]._opTime();
+ }
+ else {
+ // theirTime < ourTime
+ refetch(h, ourObj);
+ u.advance();
+ if( !u.ok() ) {
+ log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS101 reached beginning of local oplog [2]";
+ }
+ ourObj = u.current();
+ ourTime = ourObj["ts"]._opTime();
+ }
+ }
+ }
+
+ struct X {
+ const bson::bo *op;
+ bson::bo goodVersionOfObject;
+ };
+
+ static void setMinValid(bo newMinValid) {
+ try {
+ log() << "replSet minvalid=" << newMinValid["ts"]._opTime().toStringLong() << rsLog;
+ }
+ catch(...) { }
+ {
+ Helpers::putSingleton("local.replset.minvalid", newMinValid);
+ Client::Context cx( "local." );
+ cx.db()->flushFiles(true);
+ }
+ }
+
+ void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) {
+ DBClientConnection *them = r.conn();
+
+ // fetch all first so we needn't handle interruption in a fancy way
+
+ unsigned long long totSize = 0;
+
+ list< pair<DocID,bo> > goodVersions;
+
+ bo newMinValid;
+
+ /* fetch all the goodVersions of each document from current primary */
+ DocID d;
+ unsigned long long n = 0;
+ try {
+ for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
+ d = *i;
+
+ assert( !d._id.eoo() );
+
+ {
+ /* TODO : slow. lots of round trips. */
+ n++;
+                    bo good = them->findOne(d.ns, d._id.wrap(), NULL, QueryOption_SlaveOk).getOwned();
+ totSize += good.objsize();
+ uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );
+
+ // note good might be eoo, indicating we should delete it
+ goodVersions.push_back(pair<DocID,bo>(d,good));
+ }
+ }
+ newMinValid = r.getLastOp(rsoplog);
+ if( newMinValid.isEmpty() ) {
+ sethbmsg("rollback error newMinValid empty?");
+ return;
+ }
+ }
+ catch(DBException& e) {
+ sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0);
+ log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog;
+ throw e;
+ }
+
+ MemoryMappedFile::flushAll(true);
+
+ sethbmsg("rollback 3.5");
+ if( h.rbid != getRBID(r.conn()) ) {
+ // our source rolled back itself. so the data we received isn't necessarily consistent.
+ sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
+ return;
+ }
+
+ // update them
+ sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());
+
+ bool warn = false;
+
+ assert( !h.commonPointOurDiskloc.isNull() );
+
+ mongo::d.dbMutex.assertWriteLocked();
+
+ /* we have items we are writing that aren't from a point-in-time. thus best not to come online
+ until we get to that point in freshness. */
+ setMinValid(newMinValid);
+
+ /** any full collection resyncs required? */
+ if( !h.collectionsToResync.empty() ) {
+ for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) {
+ string ns = *i;
+ sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
+
+ Client::Context c(ns);
+ {
+ bob res;
+ string errmsg;
+ dropCollection(ns, errmsg, res);
+ {
+ dbtemprelease r;
+ bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, errmsg);
+ uassert(15909, str::stream() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg, ok);
+ }
+ }
+ }
+
+ /* we did more reading from primary, so check it again for a rollback (which would mess us up), and
+ make minValid newer.
+ */
+ sethbmsg("rollback 4.2");
+ {
+ string err;
+ try {
+ newMinValid = r.getLastOp(rsoplog);
+ if( newMinValid.isEmpty() ) {
+ err = "can't get minvalid from primary";
+ }
+ else {
+ setMinValid(newMinValid);
+ }
+ }
+ catch (DBException&) {
+ err = "can't get/set minvalid";
+ }
+ if( h.rbid != getRBID(r.conn()) ) {
+ // our source rolled back itself. so the data we received isn't necessarily consistent.
+ // however, we've now done writes. thus we have a problem.
+ err += "rbid at primary changed during resync/rollback";
+ }
+ if( !err.empty() ) {
+ log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog;
+ /* todo: reset minvalid so that we are permanently in fatal state */
+ /* todo: don't be fatal, but rather, get all the data first. */
+ sethbmsg("rollback error");
+ throw rsfatal();
+ }
+ }
+ sethbmsg("rollback 4.3");
+ }
+
+ sethbmsg("rollback 4.6");
+        /** drop collections to drop before doing individual fixups - doing the drops first can make the fixups below faster if there were subsequent inserts to roll back */
+ for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
+ Client::Context c(*i);
+ try {
+ bob res;
+ string errmsg;
+ log(1) << "replSet rollback drop: " << *i << rsLog;
+ dropCollection(*i, errmsg, res);
+ }
+ catch(...) {
+ log() << "replset rollback error dropping collection " << *i << rsLog;
+ }
+ }
+
+ sethbmsg("rollback 4.7");
+ Client::Context c(rsoplog);
+ NamespaceDetails *oplogDetails = nsdetails(rsoplog);
+ uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);
+
+ map<string,shared_ptr<RemoveSaver> > removeSavers;
+
+ unsigned deletes = 0, updates = 0;
+ for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) {
+ const DocID& d = i->first;
+ bo pattern = d._id.wrap(); // { _id : ... }
+ try {
+ assert( d.ns && *d.ns );
+ if( h.collectionsToResync.count(d.ns) ) {
+ /* we just synced this entire collection */
+ continue;
+ }
+
+ getDur().commitIfNeeded();
+
+ /* keep an archive of items rolled back */
+ shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
+ if ( ! rs )
+ rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );
+
+ // todo: lots of overhead in context, this can be faster
+ Client::Context c(d.ns);
+ if( i->second.isEmpty() ) {
+ // wasn't on the primary; delete.
+ /* TODO1.6 : can't delete from a capped collection. need to handle that here. */
+ deletes++;
+
+ NamespaceDetails *nsd = nsdetails(d.ns);
+ if( nsd ) {
+ if( nsd->capped ) {
+ /* can't delete from a capped collection - so we truncate instead. if this item must go,
+ so must all successors!!! */
+ try {
+                                    /** todo: IIRC cappedTruncateAfter does not handle completely empty. todo. */
+                                    // this will be crazy slow if there is no _id index.
+ long long start = Listener::getElapsedTimeMillis();
+ DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
+ if( Listener::getElapsedTimeMillis() - start > 200 )
+ log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog;
+ //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern);
+ if( !loc.isNull() ) {
+ try {
+ nsd->cappedTruncateAfter(d.ns, loc, true);
+ }
+ catch(DBException& e) {
+ if( e.getCode() == 13415 ) {
+ // hack: need to just make cappedTruncate do this...
+ nsd->emptyCappedCollection(d.ns);
+ }
+ else {
+ throw;
+ }
+ }
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog;
+ }
+ }
+ else {
+ try {
+ deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() );
+ }
+ catch(...) {
+ log() << "replSet error rollback delete failed ns:" << d.ns << rsLog;
+ }
+ }
+ // did we just empty the collection? if so let's check if it even exists on the source.
+ if( nsd->stats.nrecords == 0 ) {
+ try {
+ string sys = cc().database()->name + ".system.namespaces";
+ bo o = them->findOne(sys, QUERY("name"<<d.ns));
+ if( o.isEmpty() ) {
+ // we should drop
+ try {
+ bob res;
+ string errmsg;
+ dropCollection(d.ns, errmsg, res);
+ }
+ catch(...) {
+ log() << "replset error rolling back collection " << d.ns << rsLog;
+ }
+ }
+ }
+ catch(DBException& ) {
+ /* this isn't *that* big a deal, but is bad. */
+ log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog;
+ }
+ }
+ }
+ }
+ else {
+ // todo faster...
+ OpDebug debug;
+ updates++;
+ _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() );
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog;
+ warn = true;
+ }
+ }
+
+ removeSavers.clear(); // this effectively closes all of them
+
+ sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
+ MemoryMappedFile::flushAll(true);
+ sethbmsg("rollback 6");
+
+ // clean up oplog
+ LOG(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
+ // todo: fatal error if this throws?
+ oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);
+
+ /* reset cached lastoptimewritten and h value */
+ loadLastOpTimeWritten();
+
+ sethbmsg("rollback 7");
+ MemoryMappedFile::flushAll(true);
+
+ // done
+ if( warn )
+ sethbmsg("issues during syncRollback, see log");
+ else
+ sethbmsg("rollback done");
+ }
+
+ void ReplSetImpl::syncRollback(OplogReader&r) {
+ unsigned s = _syncRollback(r);
+ if( s )
+ sleepsecs(s);
+ }
+
+ unsigned ReplSetImpl::_syncRollback(OplogReader&r) {
+ assert( !lockedByMe() );
+ assert( !d.dbMutex.atLeastReadLocked() );
+
+ sethbmsg("rollback 0");
+
+ writelocktry lk(rsoplog, 20000);
+ if( !lk.got() ) {
+ sethbmsg("rollback couldn't get write lock in a reasonable time");
+ return 2;
+ }
+
+ if( state().secondary() ) {
+            /* by doing this, we will not service reads (we return an error as we aren't in secondary state).
+               that perhaps is moot because of the write lock above, but that write lock probably gets deferred
+ or removed or yielded later anyway.
+
+ also, this is better for status reporting - we know what is happening.
+ */
+ changeState(MemberState::RS_ROLLBACK);
+ }
+
+ HowToFixUp how;
+ sethbmsg("rollback 1");
+ {
+ r.resetCursor();
+
+ sethbmsg("rollback 2 FindCommonPoint");
+ try {
+ syncRollbackFindCommonPoint(r.conn(), how);
+ }
+ catch( const char *p ) {
+ sethbmsg(string("rollback 2 error ") + p);
+ return 10;
+ }
+ catch( rsfatal& ) {
+ _fatal();
+ return 2;
+ }
+ catch( DBException& e ) {
+ sethbmsg(string("rollback 2 exception ") + e.toString() + "; sleeping 1 min");
+ dbtemprelease r;
+ sleepsecs(60);
+ throw;
+ }
+ }
+
+ sethbmsg("replSet rollback 3 fixup");
+
+ {
+ incRBID();
+ try {
+ syncFixUp(how, r);
+ }
+ catch( rsfatal& ) {
+ sethbmsg("rollback fixup error");
+ _fatal();
+ return 2;
+ }
+ catch(...) {
+ incRBID(); throw;
+ }
+ incRBID();
+
+ /* success - leave "ROLLBACK" state
+ can go to SECONDARY once minvalid is achieved
+ */
+ changeState(MemberState::RS_RECOVERING);
+ }
+
+ return 0;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_sync.cpp b/src/mongo/db/repl/rs_sync.cpp
new file mode 100644
index 00000000000..8bac981d951
--- /dev/null
+++ b/src/mongo/db/repl/rs_sync.cpp
@@ -0,0 +1,701 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "rs.h"
+#include "../repl.h"
+#include "connections.h"
+
+namespace mongo {
+
+ using namespace bson;
+ extern unsigned replSetForceInitialSyncFailure;
+
+ void NOINLINE_DECL blank(const BSONObj& o) {
+ if( *o.getStringField("op") != 'n' ) {
+ log() << "replSet skipping bad op in oplog: " << o.toString() << rsLog;
+ }
+ }
+
+ /* apply the log op that is in param o
+ @return bool success (true) or failure (false)
+ */
+ bool replset::SyncTail::syncApply(const BSONObj &o) {
+ const char *ns = o.getStringField("ns");
+ if ( *ns == '.' || *ns == 0 ) {
+ blank(o);
+ return true;
+ }
+
+ Client::Context ctx(ns);
+ ctx.getClient()->curop()->reset();
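+        // applyOperation_inlock returns true when it could not apply the op
+        // (e.g. an update whose target document is missing locally), hence the
+        // negation: syncApply reports success.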
+ return !applyOperation_inlock(o);
+ }
+
+ /* initial oplog application, during initial sync, after cloning.
+ @return false on failure.
+       this method returns an error and doesn't throw exceptions (I think).
+ */
+ bool ReplSetImpl::initialSyncOplogApplication(const OpTime& applyGTE, const OpTime& minValid) {
+ Member *source = 0;
+ OplogReader r;
+
+ // keep trying to initial sync from oplog until we run out of targets
+ while ((source = _getOplogReader(r, applyGTE)) != 0) {
+ replset::InitialSync init(source->fullName());
+ if (init.oplogApplication(r, source, applyGTE, minValid)) {
+ return true;
+ }
+
+ r.resetConnection();
+ veto(source->fullName(), 60);
+ log() << "replSet applying oplog from " << source->fullName() << " failed, trying again" << endl;
+ }
+
+ log() << "replSet initial sync error: couldn't find oplog to sync from" << rsLog;
+ return false;
+ }
+
+ bool replset::InitialSync::oplogApplication(OplogReader& r, const Member* source,
+ const OpTime& applyGTE, const OpTime& minValid) {
+
+ const string hn = source->fullName();
+ try {
+ r.tailingQueryGTE( rsoplog, applyGTE );
+ if ( !r.haveCursor() ) {
+ log() << "replSet initial sync oplog query error" << rsLog;
+ return false;
+ }
+
+ {
+ if( !r.more() ) {
+ sethbmsg("replSet initial sync error reading remote oplog");
+ log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
+ return false;
+ }
+ bo op = r.next();
+ OpTime t = op["ts"]._opTime();
+ r.putBack(op);
+
+ if( op.firstElementFieldName() == string("$err") ) {
+ log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
+ return false;
+ }
+
+ uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
+ if( t > applyGTE ) {
+ sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
+ log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
+ log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
+ return false;
+ }
+
+ sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
+ << t.toStringPretty() << " to " << minValid.toStringPretty());
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet initial sync failing: " << e.toString() << rsLog;
+ return false;
+ }
+
+ /* we lock outside the loop to avoid the overhead of locking on every operation. */
+ writelock lk("");
+
+ // todo : use exhaust
+ OpTime ts;
+ time_t start = time(0);
+ unsigned long long n = 0;
+ int fails = 0;
+ while( ts < minValid ) {
+ try {
+ // There are some special cases with initial sync (see the catch block), so we
+ // don't want to break out of this while until we've reached minvalid. Thus, we'll
+ // keep trying to requery.
+ if( !r.more() ) {
+ OCCASIONALLY log() << "replSet initial sync oplog: no more records" << endl;
+ sleepsecs(1);
+
+ r.resetCursor();
+ r.tailingQueryGTE(rsoplog, theReplSet->lastOpTimeWritten);
+ if ( !r.haveCursor() ) {
+ if (fails++ > 30) {
+ log() << "replSet initial sync tried to query oplog 30 times, giving up" << endl;
+ return false;
+ }
+ }
+
+ continue;
+ }
+
+ BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
+ ts = o["ts"]._opTime();
+
+ {
+ if( (source->state() != MemberState::RS_PRIMARY &&
+ source->state() != MemberState::RS_SECONDARY) ||
+ replSetForceInitialSyncFailure ) {
+
+ int f = replSetForceInitialSyncFailure;
+ if( f > 0 ) {
+ replSetForceInitialSyncFailure = f-1;
+ log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
+ throw DBException("forced error",0);
+ }
+ log() << "replSet we are now primary" << rsLog;
+ throw DBException("primary changed",0);
+ }
+
+ applyOp(o, applyGTE);
+ }
+
+ if ( ++n % 1000 == 0 ) {
+ time_t now = time(0);
+ if (now - start > 10) {
+ // simple progress metering
+ log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
+ << ts.toStringPretty() << rsLog;
+ start = now;
+ }
+ }
+
+ getDur().commitIfNeeded();
+ }
+ catch (DBException& e) {
+ // Skip duplicate key exceptions.
+ // These are relatively common on initial sync: if a document is inserted
+ // early in the clone step, the insert will be replayed but the document
+ // will probably already have been cloned over.
+ if( e.getCode() == 11000 || e.getCode() == 11001 || e.getCode() == 12582) {
+ continue;
+ }
+
+ // handle cursor not found (just requery)
+ if( e.getCode() == 13127 ) {
+ log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
+ r.resetCursor();
+ r.tailingQueryGTE(rsoplog, ts);
+ if( r.haveCursor() ) {
+ continue;
+ }
+ }
+
+ // TODO: handle server restart
+
+ if( ts <= minValid ) {
+ // didn't make it far enough
+ log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
+ return false;
+ }
+
+ // otherwise, whatever, we'll break out of the loop and catch
+ // anything that's really wrong in syncTail
+ }
+ }
+ return true;
+ }
+
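+    // during initial sync an oplog update may reference a document that was not
+    // yet present when its collection was cloned; per the assertion message
+    // below, shouldRetry() adds the missing object (fetched from the sync
+    // source) so the second syncApply() can succeed.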
+ void replset::InitialSync::applyOp(const BSONObj& o, const OpTime& applyGTE) {
+ OpTime ts = o["ts"]._opTime();
+
+ // optimes before we started copying need not be applied.
+ if( ts >= applyGTE ) {
+ if (!syncApply(o)) {
+ if (shouldRetry(o)) {
+ uassert(15915, "replSet update still fails after adding missing object", syncApply(o));
+ }
+ }
+ }
+
+ // with repl sets we write the ops to our oplog, too
+ _logOpObjRS(o);
+ }
+
+ /* should be in RECOVERING state on arrival here.
+ readlocks
+ @return true if transitioned to SECONDARY
+ */
+ bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) {
+ bool golive = false;
+
+ {
+ lock lk( this );
+
+ if (_maintenanceMode > 0) {
+ // we're not actually going live
+ return true;
+ }
+ }
+
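+        // we may go live only once we have replayed past minvalid, the point
+        // noted at the end of initial sync/rollback: e.g. minvalid at ts(5000)
+        // and lastOpTimeWritten at ts(5001) -> transition to SECONDARY below.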
+ {
+ readlock lk("local.replset.minvalid");
+ BSONObj mv;
+ if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
+ minvalid = mv["ts"]._opTime();
+ if( minvalid <= lastOpTimeWritten ) {
+ golive=true;
+ }
+ }
+ else
+ golive = true; /* must have been the original member */
+ }
+ if( golive ) {
+ sethbmsg("");
+ changeState(MemberState::RS_SECONDARY);
+ }
+ return golive;
+ }
+
+ bool ReplSetImpl::_isStale(OplogReader& r, const OpTime& startTs, BSONObj& remoteOldestOp) {
+ remoteOldestOp = r.findOne(rsoplog, Query());
+ OpTime remoteTs = remoteOldestOp["ts"]._opTime();
+ DEV log() << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
+ else LOG(3) << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
+ DEV {
+ log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet our state: " << state().toString() << rsLog;
+ }
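+        // stale means the remote's oplog no longer reaches back to our position:
+        // e.g. if startTs is ts(100) but the remote's oldest entry is ts(150),
+        // tailing from this member could never replay ops 100..150.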
+ if( startTs >= remoteTs ) {
+ return false;
+ }
+
+ return true;
+ }
+
+ Member* ReplSetImpl::_getOplogReader(OplogReader& r, const OpTime& minTS) {
+ Member *target = 0, *stale = 0;
+ BSONObj oldest;
+
+ assert(r.conn() == 0);
+
+ while ((target = getMemberToSyncTo()) != 0) {
+ string current = target->fullName();
+
+ if( !r.connect(current) ) {
+ log(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
+ r.resetConnection();
+ veto(current);
+ continue;
+ }
+
+ if( !minTS.isNull() && _isStale(r, minTS, oldest) ) {
+ r.resetConnection();
+ veto(current, 600);
+ stale = target;
+ continue;
+ }
+
+ // if we made it here, the target is up and not stale
+ return target;
+ }
+
+ // the only viable sync target was stale
+ if (stale) {
+ log() << "replSet error RS102 too stale to catch up, at least from " << stale->fullName() << rsLog;
+ log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet oldest at " << stale->fullName() << " : " << oldest["ts"]._opTime().toStringLong() << rsLog;
+ log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
+
+ // reset minvalid so that we can't become primary prematurely
+ {
+ writelock lk("local.replset.minvalid");
+ Helpers::putSingleton("local.replset.minvalid", oldest);
+ }
+
+ sethbmsg("error RS102 too stale to catch up");
+ changeState(MemberState::RS_RECOVERING);
+ sleepsecs(120);
+ }
+
+ return 0;
+ }
+
+ /* tail an oplog. ok to return, will be re-called. */
+ void ReplSetImpl::syncTail() {
+ // todo : locking vis a vis the mgr...
+ OplogReader r;
+ string hn;
+
+ // find a target to sync from the last op time written
+ Member* target = _getOplogReader(r, lastOpTimeWritten);
+
+ // no server found
+ if (target == 0) {
+ // if there is no one to sync from
+ OpTime minvalid;
+ tryToGoLiveAsASecondary(minvalid);
+ return;
+ }
+
+ r.tailingQueryGTE(rsoplog, lastOpTimeWritten);
+ // if target cut connections between connecting and querying (for
+ // example, because it stepped down) we might not have a cursor
+ if ( !r.haveCursor() ) {
+ return;
+ }
+
+ uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );
+
+ {
+ if( !r.more() ) {
+ /* maybe we are ahead and need to roll back? */
+ try {
+ bo theirLastOp = r.getLastOp(rsoplog);
+ if( theirLastOp.isEmpty() ) {
+ log() << "replSet error empty query result from " << hn << " oplog" << rsLog;
+ sleepsecs(2);
+ return;
+ }
+ OpTime theirTS = theirLastOp["ts"]._opTime();
+ if( theirTS < lastOpTimeWritten ) {
+ log() << "replSet we are ahead of the primary, will try to roll back" << rsLog;
+ syncRollback(r);
+ return;
+ }
+ /* we're not ahead? maybe our new query got fresher data. best to come back and try again */
+ log() << "replSet syncTail condition 1" << rsLog;
+ sleepsecs(1);
+ }
+ catch(DBException& e) {
+ log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog;
+ veto(target->fullName());
+ sleepsecs(2);
+ }
+ return;
+ }
+
+ BSONObj o = r.nextSafe();
+ OpTime ts = o["ts"]._opTime();
+ long long h = o["h"].numberLong();
+ if( ts != lastOpTimeWritten || h != lastH ) {
+ log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << rsLog;
+ log() << "replset source's GTE: " << ts.toStringPretty() << rsLog;
+ syncRollback(r);
+ return;
+ }
+ }
+
+ /* we have now checked if we need to rollback and we either don't have to or did it. */
+ {
+ OpTime minvalid;
+ tryToGoLiveAsASecondary(minvalid);
+ }
+
+ while( 1 ) {
+ {
+ Timer timeInWriteLock;
+ writelock lk("");
+ while( 1 ) {
+ if( !r.moreInCurrentBatch() ) {
+ dbtemprelease tempRelease;
+ {
+ // we need to occasionally check some things. between
+ // batches is probably a good time.
+ if( state().recovering() ) { // perhaps we should check this earlier? but not before the rollback checks.
+ /* can we go to RS_SECONDARY state? we can if not too old and if minvalid achieved */
+ OpTime minvalid;
+ bool golive = ReplSetImpl::tryToGoLiveAsASecondary(minvalid);
+                                    if( !golive ) {
+                                        sethbmsg(str::stream() << "still syncing, not yet to minValid optime " << minvalid.toString());
+                                    }
+ // todo: too stale capability
+ }
+ if( !target->hbinfo().hbstate.readable() ) {
+ return;
+ }
+ }
+ r.more(); // to make the requestmore outside the db lock, which obviously is quite important
+ }
+ if( timeInWriteLock.micros() > 1000 ) {
+ dbtemprelease tempRelease;
+ timeInWriteLock.reset();
+ }
+ if( !r.more() )
+ break;
+ {
+ BSONObj o = r.nextSafe(); // note we might get "not master" at some point
+
+ int sd = myConfig().slaveDelay;
+ // ignore slaveDelay if the box is still initializing. once
+ // it becomes secondary we can worry about it.
+ if( sd && box.getState().secondary() ) {
+ const OpTime ts = o["ts"]._opTime();
+ long long a = ts.getSecs();
+ long long b = time(0);
+ long long lag = b - a;
+ long long sleeptime = sd - lag;
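+                            // e.g. slaveDelay = 3600 and the op is 600s old:
+                            // lag = 600, so we sleep about 3000s before applying.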
+ if( sleeptime > 0 ) {
+ dbtemprelease tempRelease;
+ uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000);
+ if( sleeptime < 60 ) {
+ sleepsecs((int) sleeptime);
+ }
+ else {
+ log() << "replSet slavedelay sleep long time: " << sleeptime << rsLog;
+ // sleep(hours) would prevent reconfigs from taking effect & such!
+ long long waitUntil = b + sleeptime;
+ while( 1 ) {
+ sleepsecs(6);
+ if( time(0) >= waitUntil )
+ break;
+
+ if( !target->hbinfo().hbstate.readable() ) {
+ break;
+ }
+
+ if( myConfig().slaveDelay != sd ) // reconf
+ break;
+ }
+ }
+ }
+ } // endif slaveDelay
+
+ d.dbMutex.assertWriteLocked();
+ try {
+                        /* if we have become primary, we don't want to apply things from elsewhere
+ anymore. assumePrimary is in the db lock so we are safe as long as
+ we check after we locked above. */
+ if( box.getState().primary() ) {
+ log(0) << "replSet stopping syncTail we are now primary" << rsLog;
+ return;
+ }
+
+ // TODO: make this whole method a member of SyncTail (SERVER-4444)
+ replset::SyncTail tail("");
+ tail.syncApply(o);
+ _logOpObjRS(o); // with repl sets we write the ops to our oplog too
+ }
+ catch (DBException& e) {
+ sethbmsg(str::stream() << "syncTail: " << e.toString() << ", syncing: " << o);
+ veto(target->fullName(), 300);
+ sleepsecs(30);
+ return;
+ }
+ }
+ } // end while
+ } // end writelock scope
+
+ r.tailCheck();
+ if( !r.haveCursor() ) {
+ LOG(1) << "replSet end syncTail pass with " << hn << rsLog;
+ // TODO : reuse our connection to the primary.
+ return;
+ }
+
+ if( !target->hbinfo().hbstate.readable() ) {
+ return;
+ }
+ // looping back is ok because this is a tailable cursor
+ }
+ }
+
+ void ReplSetImpl::_syncThread() {
+ StateBox::SP sp = box.get();
+ if( sp.state.primary() ) {
+ sleepsecs(1);
+ return;
+ }
+ if( _blockSync || sp.state.fatal() || sp.state.startup() ) {
+ sleepsecs(5);
+ return;
+ }
+
+ /* do we have anything at all? */
+ if( lastOpTimeWritten.isNull() ) {
+ syncDoInitialSync();
+ return; // _syncThread will be recalled, starts from top again in case sync failed.
+ }
+
+ /* we have some data. continue tailing. */
+ syncTail();
+ }
+
+ void ReplSetImpl::syncThread() {
+ while( 1 ) {
+ // After a reconfig, we may not be in the replica set anymore, so
+ // check that we are in the set (and not an arbiter) before
+ // trying to sync with other replicas.
+ if( ! _self ) {
+ log() << "replSet warning did not detect own host and port, not syncing, config: " << theReplSet->config() << rsLog;
+ return;
+ }
+ if( myConfig().arbiterOnly ) {
+ return;
+ }
+
+ try {
+ _syncThread();
+ }
+ catch(DBException& e) {
+ sethbmsg(str::stream() << "syncThread: " << e.toString());
+ sleepsecs(10);
+ }
+ catch(...) {
+ sethbmsg("unexpected exception in syncThread()");
+ // TODO : SET NOT SECONDARY here?
+ sleepsecs(60);
+ }
+ sleepsecs(1);
+
+ /* normally msgCheckNewState gets called periodically, but in a single node repl set there
+ are no heartbeat threads, so we do it here to be sure. this is relevant if the singleton
+ member has done a stepDown() and needs to come back up.
+ */
+ OCCASIONALLY {
+ mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+ }
+ }
+ }
+
+ void startSyncThread() {
+ static int n;
+ if( n != 0 ) {
+ log() << "replSet ERROR : more than one sync thread?" << rsLog;
+ assert( n == 0 );
+ }
+ n++;
+
+ Client::initThread("rsSync");
+        cc().iAmSyncThread(); // for isSyncThread() (not used much; used in the secondary create-index code)
+ replLocalAuth();
+ theReplSet->syncThread();
+ cc().shutdown();
+ }
+
+ void GhostSync::starting() {
+ Client::initThread("rsGhostSync");
+ replLocalAuth();
+ }
+
+ void ReplSetImpl::blockSync(bool block) {
+ _blockSync = block;
+ if (_blockSync) {
+ // syncing is how we get into SECONDARY state, so we'll be stuck in
+ // RECOVERING until we unblock
+ changeState(MemberState::RS_RECOVERING);
+ }
+ }
+
+ void GhostSync::associateSlave(const BSONObj& id, const int memberId) {
+ const OID rid = id["_id"].OID();
+ rwlock lk( _lock , true );
+ shared_ptr<GhostSlave> &g = _ghostCache[rid];
+ if( g.get() == 0 ) {
+ g.reset( new GhostSlave() );
+ wassert( _ghostCache.size() < 10000 );
+ }
+ GhostSlave &slave = *g;
+ if (slave.init) {
+ LOG(1) << "tracking " << slave.slave->h().toString() << " as " << rid << rsLog;
+ return;
+ }
+
+ slave.slave = (Member*)rs->findById(memberId);
+ if (slave.slave != 0) {
+ slave.init = true;
+ }
+ else {
+ log() << "replset couldn't find a slave with id " << memberId
+ << ", not tracking " << rid << rsLog;
+ }
+ }
+
+ void GhostSync::updateSlave(const mongo::OID& rid, const OpTime& last) {
+ rwlock lk( _lock , false );
+ MAP::iterator i = _ghostCache.find( rid );
+ if ( i == _ghostCache.end() ) {
+ OCCASIONALLY warning() << "couldn't update slave " << rid << " no entry" << rsLog;
+ return;
+ }
+
+ GhostSlave& slave = *(i->second);
+ if (!slave.init) {
+ OCCASIONALLY log() << "couldn't update slave " << rid << " not init" << rsLog;
+ return;
+ }
+
+ ((ReplSetConfig::MemberCfg)slave.slave->config()).updateGroups(last);
+ }
+
+ void GhostSync::percolate(const BSONObj& id, const OpTime& last) {
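+        // on behalf of chained slave 'rid', tail our own sync target's oplog up
+        // to 'last' so the target sees how far that slave has replicated -
+        // presumably for write-concern accounting across chained replication.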
+ const OID rid = id["_id"].OID();
+ GhostSlave* slave;
+ {
+ rwlock lk( _lock , false );
+
+ MAP::iterator i = _ghostCache.find( rid );
+ if ( i == _ghostCache.end() ) {
+ OCCASIONALLY log() << "couldn't percolate slave " << rid << " no entry" << rsLog;
+ return;
+ }
+
+ slave = i->second.get();
+ if (!slave->init) {
+ OCCASIONALLY log() << "couldn't percolate slave " << rid << " not init" << rsLog;
+ return;
+ }
+ }
+
+ assert(slave->slave);
+
+ const Member *target = rs->_currentSyncTarget;
+ if (!target || rs->box.getState().primary()
+ // we are currently syncing from someone who's syncing from us
+ // the target might end up with a new Member, but s.slave never
+ // changes so we'll compare the names
+ || target == slave->slave || target->fullName() == slave->slave->fullName()) {
+ LOG(1) << "replica set ghost target no good" << endl;
+ return;
+ }
+
+ try {
+ if (!slave->reader.haveCursor()) {
+ if (!slave->reader.connect(id, slave->slave->id(), target->fullName())) {
+ // error message logged in OplogReader::connect
+ return;
+ }
+ slave->reader.ghostQueryGTE(rsoplog, last);
+ }
+
+ LOG(1) << "replSet last: " << slave->last.toString() << " to " << last.toString() << rsLog;
+ if (slave->last > last) {
+ return;
+ }
+
+ while (slave->last <= last) {
+ if (!slave->reader.more()) {
+ // we'll be back
+ return;
+ }
+
+ BSONObj o = slave->reader.nextSafe();
+ slave->last = o["ts"]._opTime();
+ }
+ LOG(2) << "now last is " << slave->last.toString() << rsLog;
+ }
+ catch (DBException& e) {
+ // we'll be back
+ LOG(2) << "replSet ghost sync error: " << e.what() << " for "
+ << slave->slave->fullName() << rsLog;
+ slave->reader.resetConnection();
+ }
+ }
+}
diff --git a/src/mongo/db/repl/test.html b/src/mongo/db/repl/test.html
new file mode 100644
index 00000000000..295ad2ef0e0
--- /dev/null
+++ b/src/mongo/db/repl/test.html
@@ -0,0 +1,11 @@
+<HTML>
+<BODY>
+<!-- see also jstests/rs/ -->
+<iframe src="http://127.0.0.1:28000/_replSet" width="100%" height="50%" frameborder=1>
+</iframe>
+
+<iframe src="http://127.0.0.1:28001/_replSet" width="100%" height="50%" frameborder=1>
+</iframe>
+
+</BODY>
+</HTML>
diff --git a/src/mongo/db/repl/testing.js b/src/mongo/db/repl/testing.js
new file mode 100644
index 00000000000..d741cf3a644
--- /dev/null
+++ b/src/mongo/db/repl/testing.js
@@ -0,0 +1,42 @@
+// helpers for testing repl sets
+// run
+// mongo --shell <host:port> testing.js
+
+cfg = {
+ _id: 'asdf',
+ members: [
+ { _id : 0, host : "dm_hp" },
+ { _id : 2, host : "dm_hp:27002" }
+ ]
+};
+c2 = {
+ _id: 'asdf',
+ members: [
+ { _id: 0, host: "dmthink" },
+ { _id: 2, host: "dmthink:27002" }
+ ]
+};
+
+db = db.getSisterDB("admin");
+local = db.getSisterDB("local");
+
+print("\n\ndb = admin db on localhost:27017");
+print("b = admin on localhost:27002");
+print("rc(x) = db.runCommand(x)");
+print("cfg = samp replset config");
+print("i() = replSetInitiate(cfg)");
+print("ism() = rc('ismaster')");
+print("\n\n");
+
+function rc(c) { return db.runCommand(c); }
+function i() { return rc({ replSetInitiate: cfg }); }
+function ism() { return rc("isMaster"); }
+
+b = 0;
+try {
+ b = new Mongo("localhost:27002").getDB("admin");
+}
+catch (e) {
+ print("\nCouldn't connect to b mongod instance\n");
+}
+
diff --git a/src/mongo/db/repl_block.cpp b/src/mongo/db/repl_block.cpp
new file mode 100644
index 00000000000..1776225505c
--- /dev/null
+++ b/src/mongo/db/repl_block.cpp
@@ -0,0 +1,256 @@
+// repl_block.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "repl.h"
+#include "repl_block.h"
+#include "instance.h"
+#include "dbhelpers.h"
+#include "../util/background.h"
+#include "../util/mongoutils/str.h"
+#include "../client/dbclient.h"
+#include "replutil.h"
+
+//#define REPLDEBUG(x) log() << "replBlock: " << x << endl;
+#define REPLDEBUG(x)
+
+namespace mongo {
+
+ using namespace mongoutils;
+
+ class SlaveTracking : public BackgroundJob {
+ public:
+ string name() const { return "SlaveTracking"; }
+
+ static const char * NS;
+
+ struct Ident {
+
+ Ident(const BSONObj& r, const string& h, const string& n) {
+ BSONObjBuilder b;
+ b.appendElements( r );
+ b.append( "host" , h );
+ b.append( "ns" , n );
+ obj = b.obj();
+ }
+
+ bool operator<( const Ident& other ) const {
+ return obj["_id"].OID() < other.obj["_id"].OID();
+ }
+
+ BSONObj obj;
+ };
+
+ struct Info {
+ Info() : owned(false), loc(0) {}
+ ~Info() {
+ if ( loc && owned ) {
+ delete loc;
+ }
+ }
+ bool owned; // true if loc is a pointer of our creation (and not a pointer into a MMF)
+ OpTime * loc;
+ };
+
+ SlaveTracking() : _mutex("SlaveTracking") {
+ _dirty = false;
+ _started = false;
+ }
+
+ void run() {
+ Client::initThread( "slaveTracking" );
+ DBDirectClient db;
+ while ( ! inShutdown() ) {
+ sleepsecs( 1 );
+
+ if ( ! _dirty )
+ continue;
+
+ writelock lk(NS);
+
+ list< pair<BSONObj,BSONObj> > todo;
+
+ {
+ scoped_lock mylk(_mutex);
+
+ for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++ ) {
+ BSONObjBuilder temp;
+ temp.appendTimestamp( "syncedTo" , i->second.loc[0].asDate() );
+ todo.push_back( pair<BSONObj,BSONObj>( i->first.obj.getOwned() ,
+ BSON( "$set" << temp.obj() ).getOwned() ) );
+ }
+ }
+
+ for ( list< pair<BSONObj,BSONObj> >::iterator i=todo.begin(); i!=todo.end(); i++ ) {
+ db.update( NS , i->first , i->second , true );
+ }
+
+ _dirty = false;
+ }
+ }
+
+ void reset() {
+ scoped_lock mylk(_mutex);
+ _slaves.clear();
+ }
+
+ void update( const BSONObj& rid , const string& host , const string& ns , OpTime last ) {
+ REPLDEBUG( host << " " << rid << " " << ns << " " << last );
+
+ scoped_lock mylk(_mutex);
+
+#ifdef _DEBUG
+ MongoFileAllowWrites allowWrites;
+#endif
+
+ Ident ident(rid,host,ns);
+ Info& i = _slaves[ ident ];
+
+ if (theReplSet && theReplSet->isPrimary()) {
+ theReplSet->ghost->updateSlave(ident.obj["_id"].OID(), last);
+ }
+
+ if ( i.loc ) {
+ if( i.owned )
+ i.loc[0] = last;
+ else
+ getDur().setNoJournal(i.loc, &last, sizeof(last));
+ return;
+ }
+
+ d.dbMutex.assertAtLeastReadLocked();
+
+ BSONObj res;
+ if ( Helpers::findOne( NS , ident.obj , res ) ) {
+ assert( res["syncedTo"].type() );
+ i.owned = false;
+ i.loc = (OpTime*)res["syncedTo"].value();
+ getDur().setNoJournal(i.loc, &last, sizeof(last));
+ return;
+ }
+
+ i.owned = true;
+ i.loc = new OpTime(last);
+ _dirty = true;
+
+ if ( ! _started ) {
+ // start background thread here since we definitely need it
+ _started = true;
+ go();
+ }
+
+ }
+
+ bool opReplicatedEnough( OpTime op , BSONElement w ) {
+ RARELY {
+ REPLDEBUG( "looking for : " << op << " w=" << w );
+ }
+
+ if (w.isNumber()) {
+ return replicatedToNum(op, w.numberInt());
+ }
+
+ if (!theReplSet) {
+ return false;
+ }
+
+ string wStr = w.String();
+ if (wStr == "majority") {
+ // use the entire set, including arbiters, to prevent writing
+ // to a majority of the set but not a majority of voters
+ return replicatedToNum(op, theReplSet->config().getMajority());
+ }
+
+ map<string,ReplSetConfig::TagRule*>::const_iterator it = theReplSet->config().rules.find(wStr);
+ uassert(14830, str::stream() << "unrecognized getLastError mode: " << wStr,
+ it != theReplSet->config().rules.end());
+
+ return op <= (*it).second->last;
+ }
+
+ bool replicatedToNum(OpTime& op, int w) {
+ if ( w <= 1 || ! _isMaster() )
+ return true;
+
+ w--; // now this is the # of slaves i need
+ scoped_lock mylk(_mutex);
+ for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++) {
+ OpTime s = *(i->second.loc);
+ if ( s < op ) {
+ continue;
+ }
+ if ( --w == 0 )
+ return true;
+ }
+ return w <= 0;
+ }
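+
+ // e.g. (illustrative) replicatedToNum(op, 3): after w--, two slaves synced to >= op
+ // satisfy the check; the primary itself counts as the first of the three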
+
+ unsigned getSlaveCount() const {
+ scoped_lock mylk(_mutex);
+
+ return _slaves.size();
+ }
+
+ // need to be careful not to deadlock with this
+ mutable mongo::mutex _mutex;
+ map<Ident,Info> _slaves;
+ bool _dirty;
+ bool _started;
+
+ } slaveTracking;
+
+ const char * SlaveTracking::NS = "local.slaves";
+
+ void updateSlaveLocation( CurOp& curop, const char * ns , OpTime lastOp ) {
+ if ( lastOp.isNull() )
+ return;
+
+ assert( str::startsWith(ns, "local.oplog.") );
+
+ Client * c = curop.getClient();
+ assert(c);
+ BSONObj rid = c->getRemoteID();
+ if ( rid.isEmpty() )
+ return;
+
+ slaveTracking.update( rid , curop.getRemoteString( false ) , ns , lastOp );
+
+ if (theReplSet && !theReplSet->isPrimary()) {
+ // we don't know the slave's port, so we make the replica set keep
+ // a map of rids to slaves
+ log(2) << "percolating " << lastOp.toString() << " from " << rid << endl;
+ theReplSet->ghost->send( boost::bind(&GhostSync::percolate, theReplSet->ghost, rid, lastOp) );
+ }
+ }
+
+ bool opReplicatedEnough( OpTime op , BSONElement w ) {
+ return slaveTracking.opReplicatedEnough( op , w );
+ }
+
+ bool opReplicatedEnough( OpTime op , int w ) {
+ return slaveTracking.replicatedToNum( op , w );
+ }
+
+ void resetSlaveCache() {
+ slaveTracking.reset();
+ }
+
+ unsigned getSlaveCount() {
+ return slaveTracking.getSlaveCount();
+ }
+}
diff --git a/src/mongo/db/repl_block.h b/src/mongo/db/repl_block.h
new file mode 100644
index 00000000000..bb74deea10f
--- /dev/null
+++ b/src/mongo/db/repl_block.h
@@ -0,0 +1,39 @@
+// repl_block.h - blocking on writes for replication
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "client.h"
+#include "curop.h"
+
+/**
+ local.slaves - current location for all slaves
+
+ */
+namespace mongo {
+
+ void updateSlaveLocation( CurOp& curop, const char * oplog_ns , OpTime lastOp );
+
+ /** @return true if op has made it to w servers */
+ bool opReplicatedEnough( OpTime op , int w );
+ bool opReplicatedEnough( OpTime op , BSONElement w );
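+
+ /* usage sketch (illustrative): after a write whose oplog entry has OpTime 'op',
+ a getLastError with {w: 2} is satisfied once opReplicatedEnough(op, 2) returns
+ true, i.e. the primary plus one slave have the op. */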
+
+ void resetSlaveCache();
+ unsigned getSlaveCount();
+}
diff --git a/src/mongo/db/replutil.h b/src/mongo/db/replutil.h
new file mode 100644
index 00000000000..6f4dbb875d2
--- /dev/null
+++ b/src/mongo/db/replutil.h
@@ -0,0 +1,102 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "db.h"
+#include "dbhelpers.h"
+#include "json.h"
+#include "../client/dbclient.h"
+#include "repl.h"
+#include "cmdline.h"
+#include "repl/rs.h"
+#include "ops/query.h"
+
+namespace mongo {
+
+ extern const char *replAllDead;
+
+ /* note we always return true for the "local" namespace.
+
+ we should not allow most operations when not the master
+ also we report not master if we are "dead".
+
+ See also CmdIsMaster.
+ */
+ inline bool _isMaster() {
+ if( replSet ) {
+ if( theReplSet )
+ return theReplSet->isPrimary();
+ return false;
+ }
+
+ if( ! replSettings.slave )
+ return true;
+
+ if ( replAllDead )
+ return false;
+
+ if( replSettings.master ) {
+ // if running with --master --slave, allow.
+ return true;
+ }
+
+ if ( cc().isGod() )
+ return true;
+
+ return false;
+ }
+ inline bool isMaster(const char * dbname = 0) {
+ if( _isMaster() )
+ return true;
+ if ( ! dbname ) {
+ Database *database = cc().database();
+ assert( database );
+ dbname = database->name.c_str();
+ }
+ return strcmp( dbname , "local" ) == 0;
+ }
+ inline bool isMasterNs( const char *ns ) {
+ if ( _isMaster() )
+ return true;
+ assert( ns );
+ if ( ! str::startsWith( ns , "local" ) )
+ return false;
+ return ns[5] == 0 || ns[5] == '.';
+ }
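+
+ // e.g. isMasterNs("local.oplog.rs") is true on any member, while
+ // isMasterNs("test.foo") is true only when this node is master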
+
+ inline void notMasterUnless(bool expr) {
+ uassert( 10107 , "not master" , expr );
+ }
+
+ /** we allow queries to SimpleSlave's */
+ inline void replVerifyReadsOk(ParsedQuery& pq) {
+ if( replSet ) {
+ /* todo: speed up the secondary case. as written here there are 2 mutex entries, it can be 1. */
+ if( isMaster() ) return;
+ uassert(13435, "not master and slaveOk=false", pq.hasOption(QueryOption_SlaveOk));
+ uassert(13436, "not master or secondary; cannot currently read from this replSet member", theReplSet && theReplSet->isSecondary() );
+ }
+ else {
+ notMasterUnless(isMaster() || pq.hasOption(QueryOption_SlaveOk) || replSettings.slave == SimpleSlave );
+ }
+ }
+
+
+
+} // namespace mongo
diff --git a/src/mongo/db/resource.h b/src/mongo/db/resource.h
new file mode 100644
index 00000000000..9ba1ed26a0c
--- /dev/null
+++ b/src/mongo/db/resource.h
@@ -0,0 +1,16 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by db.rc
+//
+#define IDI_ICON2 102
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE 104
+#define _APS_NEXT_COMMAND_VALUE 40001
+#define _APS_NEXT_CONTROL_VALUE 1001
+#define _APS_NEXT_SYMED_VALUE 101
+#endif
+#endif
diff --git a/src/mongo/db/restapi.cpp b/src/mongo/db/restapi.cpp
new file mode 100644
index 00000000000..370051354a2
--- /dev/null
+++ b/src/mongo/db/restapi.cpp
@@ -0,0 +1,294 @@
+/** @file restapi.cpp
+ web rest api
+*/
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/net/miniwebserver.h"
+#include "../util/mongoutils/html.h"
+#include "../util/md5.hpp"
+#include "instance.h"
+#include "dbwebserver.h"
+#include "dbhelpers.h"
+#include "repl.h"
+#include "replutil.h"
+#include "clientcursor.h"
+#include "background.h"
+
+#include "restapi.h"
+
+namespace mongo {
+
+ extern const char *replInfo;
+ bool getInitialSyncCompleted();
+
+ using namespace bson;
+ using namespace mongoutils::html;
+
+ class RESTHandler : public DbWebHandler {
+ public:
+ RESTHandler() : DbWebHandler( "DUMMY REST" , 1000 , true ) {}
+
+ virtual bool handles( const string& url ) const {
+ return
+ url[0] == '/' &&
+ url.find_last_of( '/' ) > 0;
+ }
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+
+ string::size_type first = url.find( "/" , 1 );
+ if ( first == string::npos ) {
+ responseCode = 400;
+ return;
+ }
+
+ string method = MiniWebServer::parseMethod( rq );
+ string dbname = url.substr( 1 , first - 1 );
+ string coll = url.substr( first + 1 );
+ string action = "";
+
+ string::size_type last = coll.find_last_of( "/" );
+ if ( last == string::npos ) {
+ action = coll;
+ coll = "_defaultCollection";
+ }
+ else {
+ action = coll.substr( last + 1 );
+ coll = coll.substr( 0 , last );
+ }
+
+ for ( string::size_type i=0; i<coll.size(); i++ )
+ if ( coll[i] == '/' )
+ coll[i] = '.';
+
+ string fullns = MiniWebServer::urlDecode(dbname + "." + coll);
+
+ headers.push_back( (string)"x-action: " + action );
+ headers.push_back( (string)"x-ns: " + fullns );
+
+ bool html = false;
+
+ stringstream ss;
+
+ if ( method == "GET" ) {
+ responseCode = 200;
+ html = handleRESTQuery( fullns , action , params , responseCode , ss );
+ }
+ else if ( method == "POST" ) {
+ responseCode = 201;
+ handlePost( fullns , MiniWebServer::body( rq ) , params , responseCode , ss );
+ }
+ else {
+ responseCode = 400;
+ headers.push_back( "X_err: bad request" );
+ ss << "don't know how to handle a [" << method << "]";
+ out() << "don't know how to handle a [" << method << "]" << endl;
+ }
+
+ if( html )
+ headers.push_back("Content-Type: text/html;charset=utf-8");
+ else
+ headers.push_back("Content-Type: text/plain;charset=utf-8");
+
+ responseMsg = ss.str();
+ }
+
+ bool handleRESTQuery( string ns , string action , BSONObj & params , int & responseCode , stringstream & out ) {
+ Timer t;
+
+ int html = _getOption( params["html"] , 0 );
+ int skip = _getOption( params["skip"] , 0 );
+ int num = _getOption( params["limit"] , _getOption( params["count" ] , 1000 ) ); // count is old, limit is new
+
+ int one = 0;
+ if ( params["one"].type() == String && tolower( params["one"].valuestr()[0] ) == 't' ) {
+ num = 1;
+ one = 1;
+ }
+
+ BSONObjBuilder queryBuilder;
+
+ BSONObjIterator i(params);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string name = e.fieldName();
+ if ( name.find( "filter_" ) != 0 )
+ continue;
+
+ string field = name.substr(7);
+ const char * val = e.valuestr();
+
+ char * temp;
+
+ // TODO: this is how i guess if something is a number. pretty lame right now
+ double number = strtod( val , &temp );
+ if ( temp != val )
+ queryBuilder.append( field , number );
+ else
+ queryBuilder.append( field , val );
+ }
+
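+ // illustrative (hypothetical request): GET /foo/bar/_find?filter_x=1&filter_name=abc
+ // builds the query { x: 1.0, name: "abc" } against namespace "foo.bar"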
+ BSONObj query = queryBuilder.obj();
+ auto_ptr<DBClientCursor> cursor = db.query( ns.c_str() , query, num , skip );
+ uassert( 13085 , "query failed for dbwebserver" , cursor.get() );
+
+ if ( one ) {
+ if ( cursor->more() ) {
+ BSONObj obj = cursor->next();
+ out << obj.jsonString(Strict,html?1:0) << '\n';
+ }
+ else {
+ responseCode = 404;
+ }
+ return html != 0;
+ }
+
+ if( html ) {
+ string title = string("query ") + ns;
+ out << start(title)
+ << p(title)
+ << "<pre>";
+ }
+ else {
+ out << "{\n";
+ out << " \"offset\" : " << skip << ",\n";
+ out << " \"rows\": [\n";
+ }
+
+ int howMany = 0;
+ while ( cursor->more() ) {
+ if ( howMany++ && html == 0 )
+ out << " ,\n";
+ BSONObj obj = cursor->next();
+ if( html ) {
+ if( out.tellp() > 4 * 1024 * 1024 ) {
+ out << "Stopping output: more than 4MB returned and in html mode\n";
+ break;
+ }
+ out << obj.jsonString(Strict, html?1:0) << "\n\n";
+ }
+ else {
+ if( out.tellp() > 50 * 1024 * 1024 ) // 50MB limit - we are using ram
+ break;
+ out << " " << obj.jsonString();
+ }
+ }
+
+ if( html ) {
+ out << "</pre>\n";
+ if( howMany == 0 ) out << p("Collection is empty");
+ out << _end();
+ }
+ else {
+ out << "\n ],\n\n";
+ out << " \"total_rows\" : " << howMany << " ,\n";
+ out << " \"query\" : " << query.jsonString() << " ,\n";
+ out << " \"millis\" : " << t.millis() << '\n';
+ out << "}\n";
+ }
+
+ return html != 0;
+ }
+
+ // TODO Generate id and revision per couch POST spec
+ void handlePost( string ns, const char *body, BSONObj& params, int & responseCode, stringstream & out ) {
+ try {
+ BSONObj obj = fromjson( body );
+ db.insert( ns.c_str(), obj );
+ }
+ catch ( ... ) {
+ responseCode = 400; // Bad Request. Seems reasonable for now.
+ out << "{ \"ok\" : false }";
+ return;
+ }
+
+ responseCode = 201;
+ out << "{ \"ok\" : true }";
+ }
+
+ int _getOption( BSONElement e , int def ) {
+ if ( e.isNumber() )
+ return e.numberInt();
+ if ( e.type() == String )
+ return atoi( e.valuestr() );
+ return def;
+ }
+
+ DBDirectClient db;
+
+ } restHandler;
+
+ bool RestAdminAccess::haveAdminUsers() const {
+ readlocktryassert rl("admin.system.users", 10000);
+ Client::Context cx( "admin.system.users", dbpath, false );
+ return ! Helpers::isEmpty("admin.system.users", false);
+ }
+
+ BSONObj RestAdminAccess::getAdminUser( const string& username ) const {
+ Client::GodScope gs;
+ readlocktryassert rl("admin.system.users", 10000);
+ Client::Context cx( "admin.system.users" );
+ BSONObj user;
+ if ( Helpers::findOne( "admin.system.users" , BSON( "user" << username ) , user ) )
+ return user.copy();
+ return BSONObj();
+ }
+
+ class LowLevelMongodStatus : public WebStatusPlugin {
+ public:
+ LowLevelMongodStatus() : WebStatusPlugin( "overview" , 5 , "(only reported if can acquire read lock quickly)" ) {}
+
+ virtual void init() {}
+
+ void _gotLock( int millis , stringstream& ss ) {
+ ss << "<pre>\n";
+ ss << "time to get readlock: " << millis << "ms\n";
+ ss << "# databases: " << dbHolder().sizeInfo() << '\n';
+ ss << "# Cursors: " << ClientCursor::numCursors() << '\n';
+ ss << "replication: ";
+ if( *replInfo )
+ ss << "\nreplInfo: " << replInfo << "\n\n";
+ if( replSet ) {
+ ss << a("", "see replSetGetStatus link top of page") << "--replSet </a>" << cmdLine._replSet;
+ }
+ if ( replAllDead )
+ ss << "\n<b>replication replAllDead=" << replAllDead << "</b>\n";
+ else {
+ ss << "\nmaster: " << replSettings.master << '\n';
+ ss << "slave: " << replSettings.slave << '\n';
+ ss << '\n';
+ }
+
+ BackgroundOperation::dump(ss);
+ ss << "</pre>\n";
+ }
+
+ virtual void run( stringstream& ss ) {
+ Timer t;
+ readlocktry lk( "" , 300 );
+ if ( lk.got() ) {
+ _gotLock( t.millis() , ss );
+ }
+ else {
+ ss << "\n<b>timed out getting lock</b>\n";
+ }
+ }
+ } lowLevelMongodStatus;
+}
diff --git a/src/mongo/db/restapi.h b/src/mongo/db/restapi.h
new file mode 100644
index 00000000000..e5ac52083fe
--- /dev/null
+++ b/src/mongo/db/restapi.h
@@ -0,0 +1,34 @@
+/** @file restapi.h
+ */
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/admin_access.h"
+
+namespace mongo {
+
+ class RestAdminAccess : public AdminAccess {
+ public:
+ virtual ~RestAdminAccess() { }
+
+ virtual bool haveAdminUsers() const;
+ virtual BSONObj getAdminUser( const string& username ) const;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/scanandorder.cpp b/src/mongo/db/scanandorder.cpp
new file mode 100644
index 00000000000..b5e282a5866
--- /dev/null
+++ b/src/mongo/db/scanandorder.cpp
@@ -0,0 +1,105 @@
+/* scanandorder.cpp
+ Order results (that aren't already indexed and in order).
+*/
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "scanandorder.h"
+
+namespace mongo {
+
+ const unsigned ScanAndOrder::MaxScanAndOrderBytes = 32 * 1024 * 1024;
+
+ void ScanAndOrder::_add(BSONObj& k, BSONObj o, DiskLoc* loc) {
+ if (!loc) {
+ _best.insert(make_pair(k.getOwned(),o.getOwned()));
+ }
+ else {
+ BSONObjBuilder b;
+ b.appendElements(o);
+ b.append("$diskLoc", loc->toBSONObj());
+ _best.insert(make_pair(k.getOwned(), b.obj().getOwned()));
+ }
+ }
+
+ void ScanAndOrder::_addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc) {
+ /* todo : we don't correct _approxSize here. */
+ const BSONObj& worstBestKey = i->first;
+ int c = worstBestKey.woCompare(k, _order._spec.keyPattern);
+ if ( c > 0 ) {
+ // k is better, 'upgrade'
+ _best.erase(i);
+ _add(k, o, loc);
+ }
+ }
+
+
+ void ScanAndOrder::add(BSONObj o, DiskLoc* loc) {
+ assert( o.isValid() );
+ BSONObj k;
+ try {
+ k = _order.getKeyFromObject(o);
+ }
+ catch (UserException &e) {
+ if ( e.getCode() == ParallelArraysCode ) { // cannot get keys for parallel arrays
+ // fix lasterror text to be more accurate.
+ uasserted( 15925, "cannot sort with keys that are parallel arrays" );
+ }
+ else
+ throw;
+ }
+
+ if ( k.isEmpty() ) {
+ return;
+ }
+ if ( (int) _best.size() < _limit ) {
+ _approxSize += k.objsize();
+ _approxSize += o.objsize();
+
+ /* note : adjust when bson return limit adjusts. note this limit should be a bit higher. */
+ uassert( 10128 , "too much data for sort() with no index. add an index or specify a smaller limit", _approxSize < MaxScanAndOrderBytes );
+
+ _add(k, o, loc);
+ return;
+ }
+ BestMap::iterator i;
+ assert( _best.end() != _best.begin() );
+ i = _best.end();
+ i--;
+ _addIfBetter(k, o, i, loc);
+ }
+
+
+ void ScanAndOrder::fill(BufBuilder& b, Projection *filter, int& nout ) const {
+ int n = 0;
+ int nFilled = 0;
+ for ( BestMap::const_iterator i = _best.begin(); i != _best.end(); i++ ) {
+ n++;
+ if ( n <= _startFrom )
+ continue;
+ const BSONObj& o = i->second;
+ fillQueryResultFromObj(b, filter, o);
+ nFilled++;
+ if ( nFilled >= _limit )
+ break;
+ uassert( 10129 , "too much data for sort() with no index", b.len() < (int)MaxScanAndOrderBytes ); // appserver limit
+ }
+ nout = nFilled;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/scanandorder.h b/src/mongo/db/scanandorder.h
new file mode 100644
index 00000000000..33e76f61f67
--- /dev/null
+++ b/src/mongo/db/scanandorder.h
@@ -0,0 +1,111 @@
+/* scanandorder.h
+ Order results (that aren't already indexed and in order).
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "indexkey.h"
+#include "queryutil.h"
+#include "projection.h"
+
+namespace mongo {
+
+ /* todo:
+ _ limit amount of data
+ */
+
+ class KeyType : boost::noncopyable {
+ public:
+ IndexSpec _spec;
+ FieldRangeVector _keyCutter;
+ public:
+ KeyType(BSONObj pattern, const FieldRangeSet &frs):
+ _spec((assert(!pattern.isEmpty()),pattern)),
+ _keyCutter(frs, _spec, 1) {
+ }
+
+ /**
+ * @return first key of the object that would be encountered while
+ * scanning index with keySpec 'pattern' using constraints 'frs', or
+ * BSONObj() if no such key.
+ */
+ BSONObj getKeyFromObject(BSONObj o) {
+ return _keyCutter.firstMatch(o);
+ }
+ };
+
+ /* todo:
+ _ respect limit
+ _ check for excess mem usage
+ _ response size limit from runquery; push it up a bit.
+ */
+
+ inline void fillQueryResultFromObj(BufBuilder& bb, Projection *filter, const BSONObj& js, DiskLoc* loc=NULL) {
+ if ( filter ) {
+ BSONObjBuilder b( bb );
+ filter->transform( js , b );
+ if (loc)
+ b.append("$diskLoc", loc->toBSONObj());
+ b.done();
+ }
+ else if (loc) {
+ BSONObjBuilder b( bb );
+ b.appendElements(js);
+ b.append("$diskLoc", loc->toBSONObj());
+ b.done();
+ }
+ else {
+ bb.appendBuf((void*) js.objdata(), js.objsize());
+ }
+ }
+
+ typedef multimap<BSONObj,BSONObj,BSONObjCmp> BestMap;
+ class ScanAndOrder {
+ public:
+ static const unsigned MaxScanAndOrderBytes;
+
+ ScanAndOrder(int startFrom, int limit, BSONObj order, const FieldRangeSet &frs) :
+ _best( BSONObjCmp( order ) ),
+ _startFrom(startFrom), _order(order, frs) {
+ _limit = limit > 0 ? limit + _startFrom : 0x7fffffff;
+ _approxSize = 0;
+ }
+
+ int size() const { return _best.size(); }
+
+ void add(BSONObj o, DiskLoc* loc);
+
+ /* scanning complete. stick the query result in b for n objects. */
+ void fill(BufBuilder& b, Projection *filter, int& nout ) const;
+
+ private:
+
+ void _add(BSONObj& k, BSONObj o, DiskLoc* loc);
+
+ void _addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc);
+
+ BestMap _best; // key -> full object
+ int _startFrom;
+ int _limit; // max to send back.
+ KeyType _order;
+ unsigned _approxSize;
+
+ };
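+
+ /* usage sketch (illustrative; 'frs', 'cursor', 'filter' and BufBuilder 'b' are
+ assumed to be in scope):
+ ScanAndOrder so( 0, 100, BSON( "a" << 1 ), frs );
+ while ( cursor->ok() ) { so.add( cursor->current(), 0 ); cursor->advance(); }
+ int nout = 0; so.fill( b, filter, nout );
+ */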
+
+} // namespace mongo
diff --git a/src/mongo/db/security.cpp b/src/mongo/db/security.cpp
new file mode 100644
index 00000000000..c9b9bb40326
--- /dev/null
+++ b/src/mongo/db/security.cpp
@@ -0,0 +1,106 @@
+// security.cpp
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "security.h"
+#include "security_common.h"
+#include "instance.h"
+#include "client.h"
+#include "curop-inl.h"
+#include "db.h"
+#include "dbhelpers.h"
+
+// this is the _mongod only_ implementation of security.h
+
+namespace mongo {
+
+ bool AuthenticationInfo::_warned = false;
+ /*
+ void AuthenticationInfo::print() const {
+ cout << "AuthenticationInfo: " << this << '\n';
+ for ( MA::const_iterator i=_dbs.begin(); i!=_dbs.end(); i++ ) {
+ cout << "\t" << i->first << "\t" << i->second.level << '\n';
+ }
+ cout << "END" << endl;
+ }
+ */
+
+ string AuthenticationInfo::getUser( const string& dbname ) const {
+ scoped_spinlock lk(_lock);
+
+ MA::const_iterator i = _dbs.find(dbname);
+ if ( i == _dbs.end() )
+ return "";
+
+ return i->second.user;
+ }
+
+
+ bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) const {
+ if ( cc().isGod() )
+ return true;
+
+ if ( isLocalHost ) {
+ Client::GodScope gs;
+ Client::ReadContext ctx("admin.system.users");
+ BSONObj result;
+ if( ! Helpers::getSingleton("admin.system.users", result) ) {
+ if( ! _warned ) {
+ // you could get a few of these in a race, but that's ok
+ _warned = true;
+ log() << "note: no users configured in admin.system.users, allowing localhost access" << endl;
+ }
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ bool CmdAuthenticate::getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd) {
+ if (user == internalSecurity.user) {
+ uassert(15889, "key file must be used to log in with internal user", cmdLine.keyFile);
+ pwd = internalSecurity.pwd;
+ }
+ else {
+ // static BSONObj userPattern = fromjson("{\"user\":1}");
+ string systemUsers = dbname + ".system.users";
+ // OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");
+ {
+ BSONObjBuilder b;
+ b << "user" << user;
+ BSONObj query = b.done();
+ if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) {
+ log() << "auth: couldn't find user " << user << ", " << systemUsers << endl;
+ return false;
+ }
+ }
+
+ pwd = userObj.getStringField("pwd");
+ }
+ return true;
+ }
+
+ bool CmdLogout::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ ai->logout(dbname);
+ return true;
+ }
+
+} // namespace mongo
+
diff --git a/src/mongo/db/security.h b/src/mongo/db/security.h
new file mode 100755
index 00000000000..f193f305def
--- /dev/null
+++ b/src/mongo/db/security.h
@@ -0,0 +1,113 @@
+// security.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "nonce.h"
+#include "concurrency.h"
+#include "security_common.h"
+#include "../util/concurrency/spin_lock.h"
+
+// this is used by both mongos and mongod
+
+namespace mongo {
+
+ /*
+ * for a particular db
+ * levels
+ * 0 : none
+ * 1 : read
+ * 2 : write
+ */
+ struct Auth {
+
+ enum Level { NONE = 0 , READ = 1 , WRITE = 2 };
+
+ Auth() { level = NONE; }
+ Level level;
+ string user;
+ };
+
+ class AuthenticationInfo : boost::noncopyable {
+ public:
+ bool isLocalHost;
+
+ AuthenticationInfo(){ isLocalHost = false; }
+ ~AuthenticationInfo() {}
+
+ // -- modifiers ----
+
+ void logout(const string& dbname ) {
+ scoped_spinlock lk(_lock);
+ _dbs.erase(dbname);
+ }
+ void authorize(const string& dbname , const string& user ) {
+ scoped_spinlock lk(_lock);
+ _dbs[dbname].level = Auth::WRITE;
+ _dbs[dbname].user = user;
+ }
+ void authorizeReadOnly(const string& dbname , const string& user ) {
+ scoped_spinlock lk(_lock);
+ _dbs[dbname].level = Auth::READ;
+ _dbs[dbname].user = user;
+ }
+
+ // -- accessors ---
+
+ bool isAuthorized(const string& dbname) const {
+ return _isAuthorized( dbname, Auth::WRITE );
+ }
+
+ bool isAuthorizedReads(const string& dbname) const {
+ return _isAuthorized( dbname, Auth::READ );
+ }
+
+ /**
+ * @param lockType - this is from dbmutex; 1 is write, 0 is read
+ */
+ bool isAuthorizedForLock(const string& dbname, int lockType ) const {
+ return _isAuthorized( dbname , lockType > 0 ? Auth::WRITE : Auth::READ );
+ }
+
+ bool isAuthorizedForLevel( const string& dbname , Auth::Level level ) const {
+ return _isAuthorized( dbname , level );
+ }
+
+ string getUser( const string& dbname ) const;
+
+ void print() const;
+
+ protected:
+ /** takes a lock */
+ bool _isAuthorized(const string& dbname, Auth::Level level) const;
+
+ bool _isAuthorizedSingle_inlock(const string& dbname, Auth::Level level) const;
+
+ /** cannot call this locked */
+ bool _isAuthorizedSpecialChecks( const string& dbname ) const ;
+
+ private:
+ mutable SpinLock _lock;
+
+ typedef map<string,Auth> MA;
+ MA _dbs; // dbname -> auth
+
+ static bool _warned;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/security_commands.cpp b/src/mongo/db/security_commands.cpp
new file mode 100644
index 00000000000..33dbd597c83
--- /dev/null
+++ b/src/mongo/db/security_commands.cpp
@@ -0,0 +1,150 @@
+// security_commands.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// security.cpp links with both dbgrid and db. this file is db only -- at least for now.
+
+
+#include "pch.h"
+#include "security.h"
+#include "../util/md5.hpp"
+#include "json.h"
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "commands.h"
+#include "jsobj.h"
+#include "client.h"
+
+namespace mongo {
+
+ /* authentication
+
+ system.users contains
+ { user : <username>, pwd : <pwd_digest>, ... }
+
+ getnonce sends nonce to client
+
+ client then sends { authenticate:1, nonce:<nonce_str>, user:<username>, key:<key> }
+
+ where <key> is md5(<nonce_str><username><pwd_digest_str>) as a string
+ */
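+
+ /* illustrative sketch of the computation described above (pseudo-code;
+ md5Hex stands in for md5 followed by hex encoding):
+
+ pwd_digest = md5Hex( <username> + ":mongo:" + <password> ) // as stored in system.users
+ key = md5Hex( <nonce_str> + <username> + <pwd_digest> ) // computed by the client
+
+ the server recomputes <key> from its stored pwd_digest and compares.
+ */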
+
+ boost::thread_specific_ptr<nonce64> lastNonce;
+
+ class CmdGetNonce : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ void help(stringstream& h) const { h << "internal"; }
+ virtual LockType locktype() const { return NONE; }
+ CmdGetNonce() : Command("getnonce") {}
+ bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ nonce64 *n = new nonce64(Security::getNonce());
+ stringstream ss;
+ ss << hex << *n;
+ result.append("nonce", ss.str() );
+ lastNonce.reset(n);
+ return true;
+ }
+ } cmdGetNonce;
+
+ CmdLogout cmdLogout;
+
+ bool CmdAuthenticate::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << " authenticate: " << cmdObj << endl;
+
+ string user = cmdObj.getStringField("user");
+ string key = cmdObj.getStringField("key");
+ string received_nonce = cmdObj.getStringField("nonce");
+
+ if( user.empty() || key.empty() || received_nonce.empty() ) {
+ log() << "field missing/wrong type in received authenticate command "
+ << dbname
+ << endl;
+ errmsg = "auth fails";
+ sleepmillis(10);
+ return false;
+ }
+
+ stringstream digestBuilder;
+
+ {
+ bool reject = false;
+ nonce64 *ln = lastNonce.release();
+ if ( ln == 0 ) {
+ reject = true;
+ log(1) << "auth: no lastNonce" << endl;
+ }
+ else {
+ digestBuilder << hex << *ln;
+ reject = digestBuilder.str() != received_nonce;
+ if ( reject ) log(1) << "auth: different lastNonce" << endl;
+ }
+
+ if ( reject ) {
+ log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << dbname << endl;
+ errmsg = "auth fails";
+ sleepmillis(30);
+ return false;
+ }
+ }
+
+ BSONObj userObj;
+ string pwd;
+ if (!getUserObj(dbname, user, userObj, pwd)) {
+ errmsg = "auth fails";
+ return false;
+ }
+
+ md5digest d;
+ {
+ digestBuilder << user << pwd;
+ string done = digestBuilder.str();
+
+ md5_state_t st;
+ md5_init(&st);
+ md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
+ md5_finish(&st, d);
+ }
+
+ string computed = digestToString( d );
+
+ if ( key != computed ) {
+ log() << "auth: key mismatch " << user << ", ns:" << dbname << endl;
+ errmsg = "auth fails";
+ return false;
+ }
+
+ bool readOnly = userObj["readOnly"].trueValue();
+ authenticate(dbname, user, readOnly );
+
+
+ result.append( "dbname" , dbname );
+ result.append( "user" , user );
+ result.appendBool( "readOnly" , readOnly );
+
+
+ return true;
+ }
+
+ CmdAuthenticate cmdAuthenticate;
+
+} // namespace mongo
diff --git a/src/mongo/db/security_common.cpp b/src/mongo/db/security_common.cpp
new file mode 100644
index 00000000000..a480919c27e
--- /dev/null
+++ b/src/mongo/db/security_common.cpp
@@ -0,0 +1,148 @@
+// security_common.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * This file contains inter-mongo instance security helpers. Due to the
+ * requirement that it be possible to compile this into mongos and mongod, it
+ * should not depend on much external stuff.
+ */
+
+#include "pch.h"
+#include "security.h"
+#include "security_common.h"
+#include "../client/dbclient.h"
+#include "commands.h"
+#include "nonce.h"
+#include "../util/md5.hpp"
+#include "client_common.h"
+#include <sys/stat.h>
+
+namespace mongo {
+
+ bool noauth = true;
+ AuthInfo internalSecurity;
+
+ bool setUpSecurityKey(const string& filename) {
+ struct stat stats;
+
+ // check obvious file errors
+ if (stat(filename.c_str(), &stats) == -1) {
+ log() << "error getting file " << filename << ": " << strerror(errno) << endl;
+ return false;
+ }
+
+#if !defined(_WIN32)
+ // check permissions: must be X00, where X is >= 4
+ if ((stats.st_mode & (S_IRWXG|S_IRWXO)) != 0) {
+ log() << "permissions on " << filename << " are too open" << endl;
+ return false;
+ }
+#endif
+
+ const unsigned long long fileLength = stats.st_size;
+ if (fileLength < 6 || fileLength > 1024) {
+ log() << " key file " << filename << " has length " << stats.st_size
+ << ", must be between 6 and 1024 chars" << endl;
+ return false;
+ }
+
+ FILE* file = fopen( filename.c_str(), "rb" );
+ if (!file) {
+ log() << "error opening file: " << filename << ": " << strerror(errno) << endl;
+ return false;
+ }
+
+ string str = "";
+
+ // strip key file
+ unsigned long long read = 0;
+ while (read < fileLength) {
+ char buf;
+ int readLength = fread(&buf, 1, 1, file);
+ if (readLength < 1) {
+ log() << "error reading file " << filename << endl;
+ return false;
+ }
+ read++;
+
+ // check for whitespace
+ if ((buf >= '\x09' && buf <= '\x0D') || buf == ' ') {
+ continue;
+ }
+
+ // check valid base64
+ if ((buf < 'A' || buf > 'Z') && (buf < 'a' || buf > 'z') && (buf < '0' || buf > '9') && buf != '+' && buf != '/') {
+ log() << "invalid char in key file " << filename << ": " << buf << endl;
+ return false;
+ }
+
+ str += buf;
+ }
+
+ if (str.size() < 6) {
+ log() << "security key must be at least 6 characters" << endl;
+ return false;
+ }
+
+ log(1) << "security key: " << str << endl;
+
+ // createPWDigest should really not be a member func
+ DBClientConnection conn;
+ internalSecurity.pwd = conn.createPasswordDigest(internalSecurity.user, str);
+
+ return true;
+ }
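+
+ // illustrative: a conforming key file is 6-1024 base64 characters with no
+ // group/other permission bits set, e.g.
+ // openssl rand -base64 90 > keyfile && chmod 600 keyfile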
+
+ void CmdAuthenticate::authenticate(const string& dbname, const string& user, const bool readOnly) {
+ ClientBasic* c = ClientBasic::getCurrent();
+ assert(c);
+ AuthenticationInfo *ai = c->getAuthenticationInfo();
+
+ if ( readOnly ) {
+ ai->authorizeReadOnly( dbname , user );
+ }
+ else {
+ ai->authorize( dbname , user );
+ }
+ }
+
+
+ bool AuthenticationInfo::_isAuthorized(const string& dbname, Auth::Level level) const {
+ {
+ scoped_spinlock lk(_lock);
+
+ if ( _isAuthorizedSingle_inlock( dbname , level ) )
+ return true;
+
+ if ( noauth )
+ return true;
+
+ if ( _isAuthorizedSingle_inlock( "admin" , level ) )
+ return true;
+
+ if ( _isAuthorizedSingle_inlock( "local" , level ) )
+ return true;
+ }
+ return _isAuthorizedSpecialChecks( dbname );
+ }
+
+ bool AuthenticationInfo::_isAuthorizedSingle_inlock(const string& dbname, Auth::Level level) const {
+ MA::const_iterator i = _dbs.find(dbname);
+ return i != _dbs.end() && i->second.level >= level;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/security_common.h b/src/mongo/db/security_common.h
new file mode 100644
index 00000000000..6615c6e573e
--- /dev/null
+++ b/src/mongo/db/security_common.h
@@ -0,0 +1,85 @@
+// security_common.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "commands.h"
+#include "concurrency.h"
+#include "../util/concurrency/spin_lock.h"
+
+namespace mongo {
+
+ /**
+ * Internal secret key info.
+ */
+ struct AuthInfo {
+ AuthInfo() {
+ user = "__system";
+ }
+ string user;
+ string pwd;
+ };
+
+ // --noauth cmd line option
+ extern bool noauth;
+ extern AuthInfo internalSecurity;
+
+ /**
+ * This method checks the validity of filename as a security key, hashes its
+ * contents, and stores it in the internalSecurity variable. Prints an
+ * error message to the logs if there's an error.
+ * @param filename the file containing the key
+ * @return if the key was successfully stored
+ */
+ bool setUpSecurityKey(const string& filename);
+
+ class CmdAuthenticate : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return READ; }
+ virtual void help(stringstream& ss) const { ss << "internal"; }
+ CmdAuthenticate() : Command("authenticate") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ void authenticate(const string& dbname, const string& user, const bool readOnly);
+ private:
+ bool getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd);
+ };
+
+ extern CmdAuthenticate cmdAuthenticate;
+
+ class CmdLogout : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ void help(stringstream& h) const { h << "de-authenticate"; }
+ virtual LockType locktype() const { return NONE; }
+ CmdLogout() : Command("logout") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/stats/counters.cpp b/src/mongo/db/stats/counters.cpp
new file mode 100644
index 00000000000..889e8a86c4c
--- /dev/null
+++ b/src/mongo/db/stats/counters.cpp
@@ -0,0 +1,207 @@
+// counters.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "../jsobj.h"
+#include "counters.h"
+
+namespace mongo {
+
+ OpCounters::OpCounters() {
+ int zero = 0;
+
+ BSONObjBuilder b;
+ b.append( "insert" , zero );
+ b.append( "query" , zero );
+ b.append( "update" , zero );
+ b.append( "delete" , zero );
+ b.append( "getmore" , zero );
+ b.append( "command" , zero );
+ _obj = b.obj();
+
+ _insert = (AtomicUInt*)_obj["insert"].value();
+ _query = (AtomicUInt*)_obj["query"].value();
+ _update = (AtomicUInt*)_obj["update"].value();
+ _delete = (AtomicUInt*)_obj["delete"].value();
+ _getmore = (AtomicUInt*)_obj["getmore"].value();
+ _command = (AtomicUInt*)_obj["command"].value();
+ }
+
+ void OpCounters::gotOp( int op , bool isCommand ) {
+ switch ( op ) {
+ case dbInsert: /*gotInsert();*/ break; // need to handle multi-insert
+ case dbQuery:
+ if ( isCommand )
+ gotCommand();
+ else
+ gotQuery();
+ break;
+
+ case dbUpdate: gotUpdate(); break;
+ case dbDelete: gotDelete(); break;
+ case dbGetMore: gotGetMore(); break;
+ case dbKillCursors:
+ case opReply:
+ case dbMsg:
+ break;
+ default: log() << "OpCounters::gotOp unknown op: " << op << endl;
+ }
+ }
+
+ BSONObj& OpCounters::getObj() {
+ const unsigned MAX = 1 << 30;
+ RARELY {
+ bool wrap =
+ _insert->get() > MAX ||
+ _query->get() > MAX ||
+ _update->get() > MAX ||
+ _delete->get() > MAX ||
+ _getmore->get() > MAX ||
+ _command->get() > MAX;
+
+ if ( wrap ) {
+ _insert->zero();
+ _query->zero();
+ _update->zero();
+ _delete->zero();
+ _getmore->zero();
+ _command->zero();
+ }
+
+ }
+ return _obj;
+ }
+
+ IndexCounters::IndexCounters() {
+ _memSupported = _pi.blockCheckSupported();
+
+ _btreeMemHits = 0;
+ _btreeMemMisses = 0;
+ _btreeAccesses = 0;
+
+
+ _maxAllowed = ( numeric_limits< long long >::max() ) / 2;
+ _resets = 0;
+
+ _sampling = 0;
+ _samplingrate = 100;
+ }
+
+ void IndexCounters::append( BSONObjBuilder& b ) {
+ if ( ! _memSupported ) {
+ b.append( "note" , "not supported on this platform" );
+ return;
+ }
+
+ BSONObjBuilder bb( b.subobjStart( "btree" ) );
+ bb.appendNumber( "accesses" , _btreeAccesses );
+ bb.appendNumber( "hits" , _btreeMemHits );
+ bb.appendNumber( "misses" , _btreeMemMisses );
+
+ bb.append( "resets" , _resets );
+
+ bb.append( "missRatio" , (_btreeAccesses ? (_btreeMemMisses / (double)_btreeAccesses) : 0) );
+
+ bb.done();
+
+ if ( _btreeAccesses > _maxAllowed ) {
+ _btreeAccesses = 0;
+ _btreeMemMisses = 0;
+ _btreeMemHits = 0;
+ _resets++;
+ }
+ }
+
+ FlushCounters::FlushCounters()
+ : _total_time(0)
+ , _flushes(0)
+ , _last()
+ {}
+
+ void FlushCounters::flushed(int ms) {
+ _flushes++;
+ _total_time += ms;
+ _last_time = ms;
+ _last = jsTime();
+ }
+
+ void FlushCounters::append( BSONObjBuilder& b ) {
+ b.appendNumber( "flushes" , _flushes );
+ b.appendNumber( "total_ms" , _total_time );
+ b.appendNumber( "average_ms" , (_flushes ? (_total_time / double(_flushes)) : 0.0) );
+ b.appendNumber( "last_ms" , _last_time );
+ b.append("last_finished", _last);
+ }
+
+
+ void GenericCounter::hit( const string& name , int count ) {
+ scoped_lock lk( _mutex );
+ _counts[name]++;
+ }
+
+ BSONObj GenericCounter::getObj() {
+ BSONObjBuilder b(128);
+ {
+ mongo::mutex::scoped_lock lk( _mutex );
+ for ( map<string,long long>::iterator i=_counts.begin(); i!=_counts.end(); i++ ) {
+ b.appendNumber( i->first , i->second );
+ }
+ }
+ return b.obj();
+ }
+
+
+ void NetworkCounter::hit( long long bytesIn , long long bytesOut ) {
+ const long long MAX = 1ULL << 60;
+
+ // don't care about the race as it's just a counter
+ bool overflow = _bytesIn > MAX || _bytesOut > MAX;
+
+ if ( overflow ) {
+ _lock.lock();
+ _overflows++;
+ _bytesIn = bytesIn;
+ _bytesOut = bytesOut;
+ _requests = 1;
+ _lock.unlock();
+ }
+ else {
+ _lock.lock();
+ _bytesIn += bytesIn;
+ _bytesOut += bytesOut;
+ _requests++;
+ _lock.unlock();
+ }
+ }
+
+ void NetworkCounter::append( BSONObjBuilder& b ) {
+ _lock.lock();
+ b.appendNumber( "bytesIn" , _bytesIn );
+ b.appendNumber( "bytesOut" , _bytesOut );
+ b.appendNumber( "numRequests" , _requests );
+ _lock.unlock();
+ }
+
+
+ OpCounters globalOpCounters;
+ OpCounters replOpCounters;
+ IndexCounters globalIndexCounters;
+ FlushCounters globalFlushCounters;
+ NetworkCounter networkCounter;
+
+}
diff --git a/src/mongo/db/stats/counters.h b/src/mongo/db/stats/counters.h
new file mode 100644
index 00000000000..0cb29aa49aa
--- /dev/null
+++ b/src/mongo/db/stats/counters.h
@@ -0,0 +1,159 @@
+// counters.h
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../../util/net/message.h"
+#include "../../util/processinfo.h"
+#include "../../util/concurrency/spin_lock.h"
+
+namespace mongo {
+
+ /**
+ * for storing operation counters
+ * note: not thread safe. ok with that for speed
+ */
+ class OpCounters {
+ public:
+
+ OpCounters();
+
+ AtomicUInt * getInsert() { return _insert; }
+ AtomicUInt * getQuery() { return _query; }
+ AtomicUInt * getUpdate() { return _update; }
+ AtomicUInt * getDelete() { return _delete; }
+ AtomicUInt * getGetMore() { return _getmore; }
+ AtomicUInt * getCommand() { return _command; }
+
+ void incInsertInWriteLock(int n) { _insert->x += n; }
+ void gotInsert() { _insert[0]++; }
+ void gotQuery() { _query[0]++; }
+ void gotUpdate() { _update[0]++; }
+ void gotDelete() { _delete[0]++; }
+ void gotGetMore() { _getmore[0]++; }
+ void gotCommand() { _command[0]++; }
+
+ void gotOp( int op , bool isCommand );
+
+ BSONObj& getObj();
+
+ private:
+ BSONObj _obj;
+
+ // todo: there will be a lot of cache line contention on these. need to do something
+ // else eventually.
+ AtomicUInt * _insert;
+ AtomicUInt * _query;
+ AtomicUInt * _update;
+ AtomicUInt * _delete;
+ AtomicUInt * _getmore;
+ AtomicUInt * _command;
+ };
+
+ extern OpCounters globalOpCounters;
+ extern OpCounters replOpCounters;
+
+
+ class IndexCounters {
+ public:
+ IndexCounters();
+
+ // used without a mutex intentionally (can race)
+ void btree( char * node ) {
+ if ( ! _memSupported )
+ return;
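+ // probe residency on only 1 in _samplingrate calls (1 in 100 by default)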
+ if ( _sampling++ % _samplingrate )
+ return;
+ btree( _pi.blockInMemory( node ) );
+ }
+
+ void btree( bool memHit ) {
+ if ( memHit )
+ _btreeMemHits++;
+ else
+ _btreeMemMisses++;
+ _btreeAccesses++;
+ }
+ void btreeHit() { _btreeMemHits++; _btreeAccesses++; }
+ void btreeMiss() { _btreeMemMisses++; _btreeAccesses++; }
+
+ void append( BSONObjBuilder& b );
+
+ private:
+ ProcessInfo _pi;
+ bool _memSupported;
+
+ int _sampling;
+ int _samplingrate;
+
+ int _resets;
+ long long _maxAllowed;
+
+ long long _btreeMemMisses;
+ long long _btreeMemHits;
+ long long _btreeAccesses;
+ };
+
+ extern IndexCounters globalIndexCounters;
+
+ class FlushCounters {
+ public:
+ FlushCounters();
+
+ void flushed(int ms);
+
+ void append( BSONObjBuilder& b );
+
+ private:
+ long long _total_time;
+ long long _flushes;
+ int _last_time;
+ Date_t _last;
+ };
+
+ extern FlushCounters globalFlushCounters;
+
+
+ class GenericCounter {
+ public:
+ GenericCounter() : _mutex("GenericCounter") { }
+ void hit( const string& name , int count=0 );
+ BSONObj getObj();
+ private:
+ map<string,long long> _counts; // TODO: replace with thread safe map
+ mongo::mutex _mutex;
+ };
+
+ class NetworkCounter {
+ public:
+ NetworkCounter() : _bytesIn(0), _bytesOut(0), _requests(0), _overflows(0) {}
+ void hit( long long bytesIn , long long bytesOut );
+ void append( BSONObjBuilder& b );
+ private:
+ long long _bytesIn;
+ long long _bytesOut;
+ long long _requests;
+
+ long long _overflows;
+
+ SpinLock _lock;
+ };
+
+ extern NetworkCounter networkCounter;
+}
diff --git a/src/mongo/db/stats/fine_clock.h b/src/mongo/db/stats/fine_clock.h
new file mode 100644
index 00000000000..02600e718c4
--- /dev/null
+++ b/src/mongo/db/stats/fine_clock.h
@@ -0,0 +1,67 @@
+// fine_clock.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef DB_STATS_FINE_CLOCK_HEADER
+#define DB_STATS_FINE_CLOCK_HEADER
+
+#include <time.h> // struct timespec
+
+namespace mongo {
+
+ /**
+ * This is a nano-second precision clock. We're skipping the
+ * hardware TSC in favor of clock_gettime(), which on some systems
+ * does not involve a trip to the OS (VDSO).
+ *
+ * We're exporting a type WallTime that is and should remain
+ * opaque. The business of getting accurate time is still ongoing
+ * and we may change the internal representation of this class.
+ * (http://lwn.net/Articles/388188/)
+ *
+ * Really, you shouldn't be using this class in hot code paths on
+ * platforms where you're not sure the overhead is low.
+ */
+ class FineClock {
+ public:
+
+ typedef timespec WallTime;
+
+ static WallTime now() {
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts;
+ }
+
+ static uint64_t diffInNanos( WallTime end, WallTime start ) {
+ uint64_t diff;
+ if ( end.tv_nsec < start.tv_nsec ) {
+ diff = 1000000000 * ( end.tv_sec - start.tv_sec - 1);
+ diff += 1000000000 + end.tv_nsec - start.tv_nsec;
+ }
+ else {
+ diff = 1000000000 * ( end.tv_sec - start.tv_sec );
+ diff += end.tv_nsec - start.tv_nsec;
+ }
+ return diff;
+ }
+
+ };
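+
+ /* usage sketch (illustrative):
+ FineClock::WallTime start = FineClock::now();
+ // ... timed section ...
+ uint64_t elapsedNanos = FineClock::diffInNanos( FineClock::now(), start );
+ */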
+}
+
+#endif // DB_STATS_FINE_CLOCK_HEADER
+
diff --git a/src/mongo/db/stats/service_stats.cpp b/src/mongo/db/stats/service_stats.cpp
new file mode 100644
index 00000000000..d69147fe969
--- /dev/null
+++ b/src/mongo/db/stats/service_stats.cpp
@@ -0,0 +1,68 @@
+// service_stats.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <sstream>
+
+#include "../../util/histogram.h"
+#include "service_stats.h"
+
+namespace mongo {
+
+ using std::ostringstream;
+
+ ServiceStats::ServiceStats() {
+ // Time histogram covers up to 128msec in exponential intervals
+ // starting at 125usec.
+ Histogram::Options timeOpts;
+ timeOpts.numBuckets = 12;
+ timeOpts.bucketSize = 125;
+ timeOpts.exponential = true;
+ _timeHistogram = new Histogram( timeOpts );
+
+ // Space histogram covers up to 1MB in exponential intervals starting
+ // at 1K.
+ Histogram::Options spaceOpts;
+ spaceOpts.numBuckets = 12;
+ spaceOpts.bucketSize = 1024;
+ spaceOpts.exponential = true;
+ _spaceHistogram = new Histogram( spaceOpts );
+ }
+
+ ServiceStats::~ServiceStats() {
+ delete _timeHistogram;
+ delete _spaceHistogram;
+ }
+
+ void ServiceStats::logResponse( uint64_t duration, uint64_t bytes ) {
+ _spinLock.lock();
+ _timeHistogram->insert( duration / 1000 /* in usecs */ );
+ _spaceHistogram->insert( bytes );
+ _spinLock.unlock();
+ }
+
+ string ServiceStats::toHTML() const {
+ ostringstream res;
+ res << "Cumulative wire stats\n"
+ << "Response times\n" << _timeHistogram->toHTML()
+ << "Response sizes\n" << _spaceHistogram->toHTML()
+ << '\n';
+
+ return res.str();
+ }
+
+} // mongo
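The bucket boundaries in the constructor comments can be checked by hand, assuming each successive bucket doubles the previous one (util/histogram.h itself is not part of this hunk):

    // back-of-the-envelope check of the histogram constructor comments
    #include <cstdio>

    int main() {
        long long timeBound = 125;       // usecs, time histogram
        long long spaceBound = 1024;     // bytes, space histogram
        for ( int k = 0; k < 12; k++ ) {
            printf( "bucket %2d: <= %7lld usec | <= %8lld bytes\n", k, timeBound, spaceBound );
            timeBound <<= 1;             // reaches 128000 usec (128 msec) at k == 10
            spaceBound <<= 1;            // reaches 1048576 bytes (1 MB) at k == 10
        }
        return 0;
    }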
diff --git a/src/mongo/db/stats/service_stats.h b/src/mongo/db/stats/service_stats.h
new file mode 100644
index 00000000000..5b0e75fdcb9
--- /dev/null
+++ b/src/mongo/db/stats/service_stats.h
@@ -0,0 +1,66 @@
+// service_stats.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef DB_STATS_SERVICE_STATS_HEADER
+#define DB_STATS_SERVICE_STATS_HEADER
+
+#include <stdint.h>
+#include <string>
+
+#include "../../util/concurrency/spin_lock.h"
+
+namespace mongo {
+
+ using std::string;
+
+ class Histogram;
+
+ /**
+ * ServiceStats keeps track of the time a request/response message
+ * took inside a service as well as the size of the response
+ * generated.
+ */
+ class ServiceStats {
+ public:
+ ServiceStats();
+ ~ServiceStats();
+
+ /**
+ * Record the 'duration' in nanoseconds a request/response
+ * message took and the size in bytes of the generated
+ * response. (logResponse() divides by 1000 to bucket in usecs.)
+ */
+ void logResponse( uint64_t duration, uint64_t bytes );
+
+ /**
+ * Render the histogram as string that can be used inside an
+ * HTML doc.
+ */
+ string toHTML() const;
+
+ private:
+ SpinLock _spinLock; // protects state below
+ Histogram* _timeHistogram;
+ Histogram* _spaceHistogram;
+
+ // not defined; copying is disallowed
+ ServiceStats( const ServiceStats& );
+ ServiceStats& operator=( const ServiceStats& );
+ };
+
+} // namespace mongo
+
+#endif // DB_STATS_SERVICE_STATS_HEADER
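A hypothetical call site tying this header to fine_clock.h: logResponse() expects the duration in nanoseconds (the .cpp divides by 1000 to bucket in usecs), so FineClock::diffInNanos() is the natural producer. The function name and byte count below are illustrative, not part of the commit:

    #include "service_stats.h"
    #include "fine_clock.h"

    void logOneResponseExample( mongo::ServiceStats& stats ) {
        mongo::FineClock::WallTime start = mongo::FineClock::now();
        // ... build and send the response here ...
        uint64_t bytesSent = 512;   // placeholder size of the generated response
        uint64_t nanos = mongo::FineClock::diffInNanos( mongo::FineClock::now(), start );
        stats.logResponse( nanos, bytesSent );
    }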
diff --git a/src/mongo/db/stats/snapshots.cpp b/src/mongo/db/stats/snapshots.cpp
new file mode 100644
index 00000000000..900cc4ff1ad
--- /dev/null
+++ b/src/mongo/db/stats/snapshots.cpp
@@ -0,0 +1,227 @@
+// snapshots.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "snapshots.h"
+#include "../client.h"
+#include "../clientcursor.h"
+#include "../dbwebserver.h"
+#include "../../util/mongoutils/html.h"
+
+/**
+ handles snapshotting performance metrics and other such things
+ */
+namespace mongo {
+
+ void SnapshotData::takeSnapshot() {
+ _created = curTimeMicros64();
+ _globalUsage = Top::global.getGlobalData();
+// _totalWriteLockedTime = d.dbMutex.info().getTimeLocked();
+ Top::global.cloneMap(_usage);
+ }
+
+ SnapshotDelta::SnapshotDelta( const SnapshotData& older , const SnapshotData& newer )
+ : _older( older ) , _newer( newer ) {
+ assert( _newer._created > _older._created );
+ _elapsed = _newer._created - _older._created;
+ }
+
+ Top::CollectionData SnapshotDelta::globalUsageDiff() {
+ return Top::CollectionData( _older._globalUsage , _newer._globalUsage );
+ }
+ Top::UsageMap SnapshotDelta::collectionUsageDiff() {
+ assert( _newer._created > _older._created );
+ Top::UsageMap u;
+
+ for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ) {
+ Top::UsageMap::const_iterator j = _older._usage.find(i->first);
+ if (j != _older._usage.end())
+ u[i->first] = Top::CollectionData( j->second , i->second );
+ else
+ u[i->first] = i->second;
+ }
+ return u;
+ }
+
+ Snapshots::Snapshots(int n)
+ : _lock("Snapshots"), _n(n)
+ , _snapshots(new SnapshotData[n])
+ , _loc(0)
+ , _stored(0)
+ {}
+
+ const SnapshotData* Snapshots::takeSnapshot() {
+ scoped_lock lk(_lock);
+ _loc = ( _loc + 1 ) % _n;
+ _snapshots[_loc].takeSnapshot();
+ if ( _stored < _n )
+ _stored++;
+ return &_snapshots[_loc];
+ }
+
+ auto_ptr<SnapshotDelta> Snapshots::computeDelta( int numBack ) {
+ scoped_lock lk(_lock);
+ auto_ptr<SnapshotDelta> p;
+ if ( numBack < numDeltas() )
+ p.reset( new SnapshotDelta( getPrev(numBack+1) , getPrev(numBack) ) );
+ return p;
+ }
+
+ const SnapshotData& Snapshots::getPrev( int numBack ) {
+ int x = _loc - numBack;
+ if ( x < 0 )
+ x += _n;
+ return _snapshots[x];
+ }
+
+ void Snapshots::outputLockInfoHTML( stringstream& ss ) {
+ scoped_lock lk(_lock);
+ ss << "\n<div>";
+ for ( int i=0; i<numDeltas(); i++ ) {
+ SnapshotDelta d( getPrev(i+1) , getPrev(i) );
+ unsigned e = (unsigned) d.elapsed() / 1000;
+ ss << (unsigned)(100*d.percentWriteLocked());
+ if( e < 3900 || e > 4100 )
+ ss << '(' << e / 1000.0 << "s)";
+ ss << ' ';
+ }
+ ss << "</div>\n";
+ }
+
+ void SnapshotThread::run() {
+ Client::initThread("snapshotthread");
+ Client& client = cc();
+
+ long long numLoops = 0;
+
+ const SnapshotData* prev = 0;
+
+ while ( ! inShutdown() ) {
+ try {
+ const SnapshotData* s = statsSnapshots.takeSnapshot();
+
+ if ( prev && cmdLine.cpu ) {
+ unsigned long long elapsed = s->_created - prev->_created;
+ SnapshotDelta d( *prev , *s );
+ log() << "cpu: elapsed:" << (elapsed/1000) <<" writelock: " << (int)(100*d.percentWriteLocked()) << "%" << endl;
+ }
+
+ prev = s;
+ }
+ catch ( std::exception& e ) {
+ log() << "ERROR in SnapshotThread: " << e.what() << endl;
+ }
+
+ numLoops++;
+ sleepsecs(4);
+ }
+
+ client.shutdown();
+ }
+
+ using namespace mongoutils::html;
+
+ class WriteLockStatus : public WebStatusPlugin {
+ public:
+ WriteLockStatus() : WebStatusPlugin( "write lock" , 51 , "% time in write lock, by 4 sec periods" ) {}
+ virtual void init() {}
+
+ virtual void run( stringstream& ss ) {
+ statsSnapshots.outputLockInfoHTML( ss );
+
+ ss << "<a "
+ "href=\"http://www.mongodb.org/pages/viewpage.action?pageId=7209296\" "
+ "title=\"snapshot: was the db in the write lock when this page was generated?\">";
+ ss << "write locked now:</a> " << (d.dbMutex.info().isLocked() ? "true" : "false") << "\n";
+ }
+
+ } writeLockStatus;
+
+ class DBTopStatus : public WebStatusPlugin {
+ public:
+ DBTopStatus() : WebStatusPlugin( "dbtop" , 50 , "(occurrences|percent of elapsed)" ) {}
+
+ void display( stringstream& ss , double elapsed , const Top::UsageData& usage ) {
+ ss << "<td>";
+ ss << usage.count;
+ ss << "</td><td>";
+ double per = 100 * ((double)usage.time)/elapsed;
+ if( per == (int) per )
+ ss << (int) per;
+ else
+ ss << setprecision(1) << fixed << per;
+ ss << '%';
+ ss << "</td>";
+ }
+
+ void display( stringstream& ss , double elapsed , const string& ns , const Top::CollectionData& data ) {
+ if ( ns != "TOTAL" && data.total.count == 0 )
+ return;
+ ss << "<tr><th>" << ns << "</th>";
+
+ display( ss , elapsed , data.total );
+
+ display( ss , elapsed , data.readLock );
+ display( ss , elapsed , data.writeLock );
+
+ display( ss , elapsed , data.queries );
+ display( ss , elapsed , data.getmore );
+ display( ss , elapsed , data.insert );
+ display( ss , elapsed , data.update );
+ display( ss , elapsed , data.remove );
+
+ ss << "</tr>\n";
+ }
+
+ void run( stringstream& ss ) {
+ auto_ptr<SnapshotDelta> delta = statsSnapshots.computeDelta();
+ if ( ! delta.get() )
+ return;
+
+ ss << "<table border=1 cellpadding=2 cellspacing=0>";
+ ss << "<tr align='left'><th>";
+ ss << a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "namespace") <<
+ "NS</a></th>"
+ "<th colspan=2>total</th>"
+ "<th colspan=2>Reads</th>"
+ "<th colspan=2>Writes</th>"
+ "<th colspan=2>Queries</th>"
+ "<th colspan=2>GetMores</th>"
+ "<th colspan=2>Inserts</th>"
+ "<th colspan=2>Updates</th>"
+ "<th colspan=2>Removes</th>";
+ ss << "</tr>\n";
+
+ display( ss , (double) delta->elapsed() , "TOTAL" , delta->globalUsageDiff() );
+
+ Top::UsageMap usage = delta->collectionUsageDiff();
+ for ( Top::UsageMap::iterator i=usage.begin(); i != usage.end(); i++ ) {
+ display( ss , (double) delta->elapsed() , i->first , i->second );
+ }
+
+ ss << "</table>";
+
+ }
+
+ virtual void init() {}
+ } dbtopStatus;
+
+ Snapshots statsSnapshots;
+ SnapshotThread snapshotThread;
+
+}
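The ring-buffer arithmetic in getPrev() is worth checking by hand; a self-contained replica of the wraparound with the default 100 slots:

    #include <cassert>

    int main() {
        int n = 100;                 // Snapshots(int n=100)
        int loc = 2;                 // slot of the most recent snapshot
        int numBack = 5;             // five snapshots ago
        int x = loc - numBack;       // -3
        if ( x < 0 )
            x += n;                  // wraps to slot 97, same as Snapshots::getPrev
        assert( x == 97 );
        return 0;
    }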
diff --git a/src/mongo/db/stats/snapshots.h b/src/mongo/db/stats/snapshots.h
new file mode 100644
index 00000000000..d9b8e5eb901
--- /dev/null
+++ b/src/mongo/db/stats/snapshots.h
@@ -0,0 +1,114 @@
+// snapshots.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "top.h"
+#include "../../util/background.h"
+
+/**
+ handles snapshotting performance metrics and other such things
+ */
+namespace mongo {
+
+ class SnapshotThread;
+
+ /**
+ * stores a point in time snapshot
+ * i.e. all counters at a given time
+ */
+ class SnapshotData {
+ void takeSnapshot();
+
+ unsigned long long _created;
+ Top::CollectionData _globalUsage;
+ unsigned long long _totalWriteLockedTime; // micros of total time locked
+ Top::UsageMap _usage;
+
+ friend class SnapshotThread;
+ friend class SnapshotDelta;
+ friend class Snapshots;
+ };
+
+ /**
+ * contains performance information for a time period
+ */
+ class SnapshotDelta {
+ public:
+ SnapshotDelta( const SnapshotData& older , const SnapshotData& newer );
+
+ unsigned long long start() const {
+ return _older._created;
+ }
+
+ unsigned long long elapsed() const {
+ return _elapsed;
+ }
+
+ unsigned long long timeInWriteLock() const {
+ return _newer._totalWriteLockedTime - _older._totalWriteLockedTime;
+ }
+ double percentWriteLocked() const {
+ double e = (double) elapsed();
+ double w = (double) timeInWriteLock();
+ return w/e;
+ }
+
+ Top::CollectionData globalUsageDiff();
+ Top::UsageMap collectionUsageDiff();
+
+ private:
+ const SnapshotData& _older;
+ const SnapshotData& _newer;
+
+ unsigned long long _elapsed;
+ };
+
+ class Snapshots {
+ public:
+ Snapshots(int n=100);
+
+ const SnapshotData* takeSnapshot();
+
+ int numDeltas() const { return _stored-1; }
+
+ const SnapshotData& getPrev( int numBack = 0 );
+ auto_ptr<SnapshotDelta> computeDelta( int numBack = 0 );
+
+
+ void outputLockInfoHTML( stringstream& ss );
+ private:
+ mongo::mutex _lock;
+ int _n;
+ boost::scoped_array<SnapshotData> _snapshots;
+ int _loc;
+ int _stored;
+ };
+
+ class SnapshotThread : public BackgroundJob {
+ public:
+ virtual string name() const { return "snapshot"; }
+ void run();
+ };
+
+ extern Snapshots statsSnapshots;
+ extern SnapshotThread snapshotThread;
+
+
+}
diff --git a/src/mongo/db/stats/top.cpp b/src/mongo/db/stats/top.cpp
new file mode 100644
index 00000000000..f5b6ee42f1c
--- /dev/null
+++ b/src/mongo/db/stats/top.cpp
@@ -0,0 +1,183 @@
+// top.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "top.h"
+#include "../../util/net/message.h"
+#include "../commands.h"
+
+namespace mongo {
+
+ Top::UsageData::UsageData( const UsageData& older , const UsageData& newer ) {
+ // this won't be 100% accurate on rollovers and drop(), but at least it won't be negative
+ time = (newer.time >= older.time) ? (newer.time - older.time) : newer.time;
+ count = (newer.count >= older.count) ? (newer.count - older.count) : newer.count;
+ }
+
+ Top::CollectionData::CollectionData( const CollectionData& older , const CollectionData& newer )
+ : total( older.total , newer.total ) ,
+ readLock( older.readLock , newer.readLock ) ,
+ writeLock( older.writeLock , newer.writeLock ) ,
+ queries( older.queries , newer.queries ) ,
+ getmore( older.getmore , newer.getmore ) ,
+ insert( older.insert , newer.insert ) ,
+ update( older.update , newer.update ) ,
+ remove( older.remove , newer.remove ),
+ commands( older.commands , newer.commands ) {
+
+ }
+
+ void Top::record( const string& ns , int op , int lockType , long long micros , bool command ) {
+ if ( ns[0] == '?' )
+ return;
+
+ //cout << "record: " << ns << "\t" << op << "\t" << command << endl;
+ scoped_lock lk(_lock);
+
+ if ( ( command || op == dbQuery ) && ns == _lastDropped ) {
+ _lastDropped = "";
+ return;
+ }
+
+ CollectionData& coll = _usage[ns];
+ _record( coll , op , lockType , micros , command );
+ _record( _global , op , lockType , micros , command );
+ }
+
+ void Top::_record( CollectionData& c , int op , int lockType , long long micros , bool command ) {
+ c.total.inc( micros );
+
+ if ( lockType > 0 )
+ c.writeLock.inc( micros );
+ else if ( lockType < 0 )
+ c.readLock.inc( micros );
+
+ switch ( op ) {
+ case 0:
+ // use 0 for unknown, non-specific
+ break;
+ case dbUpdate:
+ c.update.inc( micros );
+ break;
+ case dbInsert:
+ c.insert.inc( micros );
+ break;
+ case dbQuery:
+ if ( command )
+ c.commands.inc( micros );
+ else
+ c.queries.inc( micros );
+ break;
+ case dbGetMore:
+ c.getmore.inc( micros );
+ break;
+ case dbDelete:
+ c.remove.inc( micros );
+ break;
+ case dbKillCursors:
+ break;
+ case opReply:
+ case dbMsg:
+ log() << "unexpected op in Top::record: " << op << endl;
+ break;
+ default:
+ log() << "unknown op in Top::record: " << op << endl;
+ }
+
+ }
+
+ void Top::collectionDropped( const string& ns ) {
+ //cout << "collectionDropped: " << ns << endl;
+ scoped_lock lk(_lock);
+ _usage.erase(ns);
+ _lastDropped = ns;
+ }
+
+ void Top::cloneMap(Top::UsageMap& out) const {
+ scoped_lock lk(_lock);
+ out = _usage;
+ }
+
+ void Top::append( BSONObjBuilder& b ) {
+ scoped_lock lk( _lock );
+ _appendToUsageMap( b , _usage );
+ }
+
+ void Top::_appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const {
+ for ( UsageMap::const_iterator i=map.begin(); i!=map.end(); i++ ) {
+ BSONObjBuilder bb( b.subobjStart( i->first ) );
+
+ const CollectionData& coll = i->second;
+
+ _appendStatsEntry( bb , "total" , coll.total );
+
+ _appendStatsEntry( bb , "readLock" , coll.readLock );
+ _appendStatsEntry( bb , "writeLock" , coll.writeLock );
+
+ _appendStatsEntry( bb , "queries" , coll.queries );
+ _appendStatsEntry( bb , "getmore" , coll.getmore );
+ _appendStatsEntry( bb , "insert" , coll.insert );
+ _appendStatsEntry( bb , "update" , coll.update );
+ _appendStatsEntry( bb , "remove" , coll.remove );
+ _appendStatsEntry( bb , "commands" , coll.commands );
+
+ bb.done();
+ }
+ }
+
+ void Top::_appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& data ) const {
+ BSONObjBuilder bb( b.subobjStart( statsName ) );
+ bb.appendNumber( "time" , data.time );
+ bb.appendNumber( "count" , data.count );
+ bb.done();
+ }
+
+ class TopCmd : public Command {
+ public:
+ TopCmd() : Command( "top", true ) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream& help ) const { help << "usage by collection, in micros "; }
+
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ {
+ BSONObjBuilder b( result.subobjStart( "totals" ) );
+ b.append( "note" , "all times in microseconds" );
+ Top::global.append( b );
+ b.done();
+ }
+ return true;
+ }
+
+ } topCmd;
+
+ Top Top::global;
+
+ TopOld::T TopOld::_snapshotStart = TopOld::currentTime();
+ TopOld::D TopOld::_snapshotDuration;
+ TopOld::UsageMap TopOld::_totalUsage;
+ TopOld::UsageMap TopOld::_snapshotA;
+ TopOld::UsageMap TopOld::_snapshotB;
+ TopOld::UsageMap &TopOld::_snapshot = TopOld::_snapshotA;
+ TopOld::UsageMap &TopOld::_nextSnapshot = TopOld::_snapshotB;
+ mongo::mutex TopOld::topMutex("topMutex");
+
+
+}
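A hypothetical call into the global Top instance, mirroring what the request path does after timing an operation: a 2500-usec plain query (not a command) under a read lock (negative lockType) increments total, readLock and queries for both "test.foo" and the global bucket:

    #include "top.h"

    void recordQueryExample() {
        // ns, op, lockType, micros, command
        mongo::Top::global.record( "test.foo" , mongo::dbQuery , -1 , 2500 , false );
    }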
diff --git a/src/mongo/db/stats/top.h b/src/mongo/db/stats/top.h
new file mode 100644
index 00000000000..9645ed1a3a6
--- /dev/null
+++ b/src/mongo/db/stats/top.h
@@ -0,0 +1,247 @@
+// top.h : DB usage monitor.
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <boost/date_time/posix_time/posix_time.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace mongo {
+
+ /**
+ * tracks usage by collection
+ */
+ class Top {
+
+ public:
+ Top() : _lock("Top") { }
+
+ struct UsageData {
+ UsageData() : time(0) , count(0) {}
+ UsageData( const UsageData& older , const UsageData& newer );
+ long long time;
+ long long count;
+
+ void inc( long long micros ) {
+ count++;
+ time += micros;
+ }
+ };
+
+ struct CollectionData {
+ CollectionData() {}
+
+ /**
+ * constructs a diff between an older and a newer snapshot
+ */
+ CollectionData( const CollectionData& older , const CollectionData& newer );
+
+ UsageData total;
+
+ UsageData readLock;
+ UsageData writeLock;
+
+ UsageData queries;
+ UsageData getmore;
+ UsageData insert;
+ UsageData update;
+ UsageData remove;
+ UsageData commands;
+ };
+
+ typedef map<string,CollectionData> UsageMap;
+
+ public:
+ void record( const string& ns , int op , int lockType , long long micros , bool command );
+ void append( BSONObjBuilder& b );
+ void cloneMap(UsageMap& out) const;
+ CollectionData getGlobalData() const { return _global; }
+ void collectionDropped( const string& ns );
+
+ public: // static stuff
+ static Top global;
+
+ private:
+ void _appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const;
+ void _appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& data ) const;
+ void _record( CollectionData& c , int op , int lockType , long long micros , bool command );
+
+ mutable mongo::mutex _lock;
+ CollectionData _global;
+ UsageMap _usage;
+ string _lastDropped;
+ };
+
+ /* Records per namespace utilization of the mongod process.
+ No two functions of this class may be called concurrently.
+ */
+ class TopOld {
+ typedef boost::posix_time::ptime T;
+ typedef boost::posix_time::time_duration D;
+ typedef boost::tuple< D, int, int, int > UsageData;
+ public:
+ TopOld() : _read(false), _write(false) { }
+
+ /* these are used to record activity: */
+
+ void clientStart( const char *client ) {
+ clientStop();
+ _currentStart = currentTime();
+ _current = client;
+ }
+
+ /* indicate current request is a read operation. */
+ void setRead() { _read = true; }
+
+ void setWrite() { _write = true; }
+
+ void clientStop() {
+ if ( _currentStart == T() )
+ return;
+ D d = currentTime() - _currentStart;
+
+ {
+ scoped_lock L(topMutex);
+ recordUsage( _current, d );
+ }
+
+ _currentStart = T();
+ _read = false;
+ _write = false;
+ }
+
+ /* these are used to fetch the stats: */
+
+ struct Usage {
+ string ns;
+ D time;
+ double pct;
+ int reads, writes, calls;
+ };
+
+ static void usage( vector< Usage > &res ) {
+ scoped_lock L(topMutex);
+
+ // Populate parent namespaces
+ UsageMap snapshot;
+ UsageMap totalUsage;
+ fillParentNamespaces( snapshot, _snapshot );
+ fillParentNamespaces( totalUsage, _totalUsage );
+
+ multimap< D, string, more > sorted;
+ for( UsageMap::iterator i = snapshot.begin(); i != snapshot.end(); ++i )
+ sorted.insert( make_pair( i->second.get<0>(), i->first ) );
+ for( multimap< D, string, more >::iterator i = sorted.begin(); i != sorted.end(); ++i ) {
+ if ( trivialNs( i->second.c_str() ) )
+ continue;
+ Usage u;
+ u.ns = i->second;
+ u.time = totalUsage[ u.ns ].get<0>();
+ u.pct = _snapshotDuration != D() ? 100.0 * i->first.ticks() / _snapshotDuration.ticks() : 0;
+ u.reads = snapshot[ u.ns ].get<1>();
+ u.writes = snapshot[ u.ns ].get<2>();
+ u.calls = snapshot[ u.ns ].get<3>();
+ res.push_back( u );
+ }
+ for( UsageMap::iterator i = totalUsage.begin(); i != totalUsage.end(); ++i ) {
+ if ( snapshot.count( i->first ) != 0 || trivialNs( i->first.c_str() ) )
+ continue;
+ Usage u;
+ u.ns = i->first;
+ u.time = i->second.get<0>();
+ u.pct = 0;
+ u.reads = 0;
+ u.writes = 0;
+ u.calls = 0;
+ res.push_back( u );
+ }
+ }
+
+ static void completeSnapshot() {
+ scoped_lock L(topMutex);
+
+ if ( &_snapshot == &_snapshotA ) {
+ _snapshot = _snapshotB;
+ _nextSnapshot = _snapshotA;
+ }
+ else {
+ _snapshot = _snapshotA;
+ _nextSnapshot = _snapshotB;
+ }
+ _snapshotDuration = currentTime() - _snapshotStart;
+ _snapshotStart = currentTime();
+ _nextSnapshot.clear();
+ }
+
+ private:
+ static mongo::mutex topMutex;
+ static bool trivialNs( const char *ns ) {
+ const char *ret = strrchr( ns, '.' );
+ return ret && ret[ 1 ] == '\0';
+ }
+ typedef map<string,UsageData> UsageMap; // duration, # reads, # writes, # total calls
+ static T currentTime() {
+ return boost::posix_time::microsec_clock::universal_time();
+ }
+ void recordUsage( const string &client, D duration ) {
+ recordUsageForMap( _totalUsage, client, duration );
+ recordUsageForMap( _nextSnapshot, client, duration );
+ }
+ void recordUsageForMap( UsageMap &map, const string &client, D duration ) {
+ UsageData& g = map[client];
+ g.get< 0 >() += duration;
+ if ( _read && !_write )
+ g.get< 1 >()++;
+ else if ( !_read && _write )
+ g.get< 2 >()++;
+ g.get< 3 >()++;
+ }
+ static void fillParentNamespaces( UsageMap &to, const UsageMap &from ) {
+ for( UsageMap::const_iterator i = from.begin(); i != from.end(); ++i ) {
+ string current = i->first;
+ size_t dot = current.rfind( "." );
+ if ( dot == string::npos || dot != current.length() - 1 ) {
+ inc( to[ current ], i->second );
+ }
+ while( dot != string::npos ) {
+ current = current.substr( 0, dot );
+ inc( to[ current ], i->second );
+ dot = current.rfind( "." );
+ }
+ }
+ }
+ static void inc( UsageData &to, const UsageData &from ) {
+ to.get<0>() += from.get<0>();
+ to.get<1>() += from.get<1>();
+ to.get<2>() += from.get<2>();
+ to.get<3>() += from.get<3>();
+ }
+ struct more { bool operator()( const D &a, const D &b ) { return a > b; } };
+ string _current;
+ T _currentStart;
+ static T _snapshotStart;
+ static D _snapshotDuration;
+ static UsageMap _totalUsage;
+ static UsageMap _snapshotA;
+ static UsageMap _snapshotB;
+ static UsageMap &_snapshot;
+ static UsageMap &_nextSnapshot;
+ bool _read;
+ bool _write;
+ };
+
+} // namespace mongo
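fillParentNamespaces() is private, but its rollup walk is easy to replicate standalone; with usage recorded against "test.foo.bar", the same loop yields entries for the namespace itself and each parent:

    #include <cstdio>
    #include <map>
    #include <string>
    using std::map; using std::string;

    int main() {
        map<string,int> to;
        string current = "test.foo.bar";
        int calls = 7;                                 // stand-in for the tuple counters
        size_t dot = current.rfind( "." );
        if ( dot == string::npos || dot != current.length() - 1 )
            to[ current ] += calls;                    // the namespace itself
        while ( dot != string::npos ) {                // then "test.foo", then "test"
            current = current.substr( 0, dot );
            to[ current ] += calls;
            dot = current.rfind( "." );
        }
        for ( map<string,int>::iterator i = to.begin(); i != to.end(); ++i )
            printf( "%s -> %d\n", i->first.c_str(), i->second );
        return 0;                                      // prints test, test.foo, test.foo.bar
    }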
diff --git a/src/mongo/db/taskqueue.h b/src/mongo/db/taskqueue.h
new file mode 100644
index 00000000000..005bd986f11
--- /dev/null
+++ b/src/mongo/db/taskqueue.h
@@ -0,0 +1,106 @@
+// @file taskqueue.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mongomutex.h"
+
+namespace mongo {
+
+ /** defer work items by queueing them for invocation by another thread. the presumption is
+ that the consumer thread is outside of locks more than the source thread. an additional
+ presumption is that several objects or micro-tasks will be queued and that having a single
+ thread processing them in batch is helpful, as (in the first use case) they share a common
+ data structure that can then stay in local cpu caches.
+
+ this class is in db/ as it is dbMutex (mongomutex) specific (so far).
+
+ using a functor instead of go() might be more elegant; once again, we'd like to test any
+ performance differential. there is also a worry that operator() hides things.
+
+ MT - copyable "micro task" object we can queue
+ must have a static method void MT::go(const MT&)
+
+ see DefInvoke in dbtests/ for an example.
+ */
+ template< class MT >
+ class TaskQueue {
+ public:
+ TaskQueue() : _which(0), _invokeMutex("deferredinvoker") { }
+
+ void defer(MT mt) {
+ // only one writer allowed. however the invoke processing below can occur concurrently with
+ // writes (for the most part)
+ DEV d.dbMutex.assertWriteLocked();
+
+ _queues[_which].push_back(mt);
+ }
+
+ /** call to process deferrals.
+
+ concurrency: handled herein. multiple threads could call invoke(), but their efforts will be
+ serialized. the common case is that there is a single processor calling invoke().
+
+ normally, you call this outside of any lock. but if you want to fully drain the queue,
+ call from within a read lock. for example:
+ {
+ // drain with minimal time in lock
+ d.invoke();
+ readlock lk;
+ d.invoke();
+ ...
+ }
+ you can also call invoke periodically to do some work and then pick up later on more.
+ */
+ void invoke() {
+ mutex::scoped_lock lk2(_invokeMutex);
+ int toDrain = 0;
+ {
+ // flip queueing to the other queue (we are double buffered)
+ readlocktry lk("", 5);
+ if( !lk.got() )
+ return;
+ toDrain = _which;
+ _which = _which ^ 1;
+ wassert( _queues[_which].empty() ); // we are in dbMutex, so it should be/stay empty til we exit dbMutex
+ }
+
+ _drain( _queues[toDrain] );
+ assert( _queues[toDrain].empty() );
+ }
+
+ private:
+ int _which; // 0 or 1
+ typedef vector< MT > Queue;
+ Queue _queues[2];
+
+ // lock order when multiple locks: dbMutex, _invokeMutex
+ mongo::mutex _invokeMutex;
+
+ void _drain(Queue& queue) {
+ unsigned oldCap = queue.capacity();
+ for( typename Queue::iterator i = queue.begin(); i != queue.end(); i++ ) {
+ const MT& v = *i;
+ MT::go(v);
+ }
+ queue.clear();
+ DEV assert( queue.capacity() == oldCap ); // just checking that clear() doesn't deallocate, we don't want that
+ }
+ };
+
+}
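The MT contract is minimal: copyable, with a static void MT::go(const MT&). An illustrative shape (names hypothetical; note that defer() asserts the caller holds dbMutex and invoke() briefly takes a read lock, so this is not runnable standalone):

    struct CacheUpdateTask {                 // hypothetical micro-task
        int key;
        int value;
        static void go( const CacheUpdateTask& t ) {
            // apply t.key / t.value to the shared structure here
        }
    };

    // TaskQueue<CacheUpdateTask> deferredUpdates;
    // writer, inside dbMutex:      deferredUpdates.defer( CacheUpdateTask() );
    // consumer thread, outside it: deferredUpdates.invoke();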
diff --git a/src/mongo/db/tests.cpp b/src/mongo/db/tests.cpp
new file mode 100644
index 00000000000..00f299e1bb6
--- /dev/null
+++ b/src/mongo/db/tests.cpp
@@ -0,0 +1,68 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* tests.cpp
+
+ unit test & such
+*/
+
+#include "pch.h"
+#include "../util/mmap.h"
+
+namespace mongo {
+
+ int test2_old9() {
+ out() << "test2" << endl;
+ printStackTrace();
+ if ( 1 )
+ return 1;
+
+ MemoryMappedFile f;
+
+ unsigned long long len = 64*1024*1024;
+ char *p = (char *) f.map("/tmp/test.dat", len);
+ char *start = p;
+ char *end = p + 64*1024*1024-2;
+ end[1] = 'z';
+ int i = 0;
+ while ( p < end ) {
+ *p++ = ' ';
+ if ( ++i%64 == 0 ) {
+ *p++ = '\n';
+ *p++ = 'x';
+ }
+ }
+ *p = 'a';
+
+ f.flush(true);
+ out() << "done" << endl;
+
+ char *x = start + 32 * 1024 * 1024;
+ char *y = start + 48 * 1024 * 1024;
+ char *z = start + 62 * 1024 * 1024;
+
+ strcpy(z, "zfoo");
+ out() << "y" << endl;
+ strcpy(y, "yfoo");
+ strcpy(x, "xfoo");
+ strcpy(start, "xfoo");
+
+ dbexit( EXIT_TEST );
+
+ return 1;
+ }
+
+} // namespace mongo