From 34f8e800ce518a3cedee2f2a61cafc7e919e0bb1 Mon Sep 17 00:00:00 2001 From: agirbal Date: Tue, 25 Jan 2011 14:41:16 -0800 Subject: [CACHE-2398]: for inline mapreduce, all emitted objects are kept in RAM before the 1st reduce, potential high memory usage --- db/commands/mr.cpp | 29 ++++++++++++++++------------- db/commands/mr.h | 3 ++- 2 files changed, 18 insertions(+), 14 deletions(-) (limited to 'db') diff --git a/db/commands/mr.cpp b/db/commands/mr.cpp index 532c430a5a7..5fbe2434f1e 100644 --- a/db/commands/mr.cpp +++ b/db/commands/mr.cpp @@ -652,6 +652,7 @@ namespace mongo { InMemory * n = new InMemory(); // for new data long nSize = 0; + long dupCount = 0; for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) { BSONObj key = i->first; @@ -667,19 +668,20 @@ namespace mongo { } else { // add to new map - _add( n , all[0] , nSize ); + _add( n , all[0] , nSize, dupCount ); } } else if ( all.size() > 1 ) { // several values, reduce and add to map BSONObj res = _config.reducer->reduce( all ); - _add( n , res , nSize ); + _add( n , res , nSize, dupCount ); } } // swap maps _temp.reset( n ); _size = nSize; + _dupCount = dupCount; } /** @@ -710,31 +712,32 @@ namespace mongo { */ void State::emit( const BSONObj& a ) { _numEmits++; - _add( _temp.get() , a , _size ); + _add( _temp.get() , a , _size, _dupCount ); } - void State::_add( InMemory* im, const BSONObj& a , long& size ) { + void State::_add( InMemory* im, const BSONObj& a , long& size, long& dupCount ) { BSONList& all = (*im)[a]; all.push_back( a ); size += a.objsize() + 16; + if (all.size() > 1) + ++dupCount; } /** * this method checks the size of in memory map and potentially flushes to disk */ void State::checkSize() { - if ( ! _onDisk ) - return; - - // the limits to flush to disk are rather low, a few KB, may need to increase - if ( _size < 1024 * 5 ) + if ( _size < 1024 * 50 ) return; - long before = _size; - reduceInMemory(); - log(1) << " mr: did reduceInMemory " << before << " -->> " << _size << endl; + // attempt to reduce in memory map, if we've seen duplicates + if ( _dupCount > 0) { + long before = _size; + reduceInMemory(); + log(1) << " mr: did reduceInMemory " << before << " -->> " << _size << endl; + } - if ( _size < 1024 * 15 ) + if ( ! _onDisk || _size < 1024 * 100 ) return; dumpToInc(); diff --git a/db/commands/mr.h b/db/commands/mr.h index f8ec495ecff..2f3520230f2 100644 --- a/db/commands/mr.h +++ b/db/commands/mr.h @@ -268,7 +268,7 @@ namespace mongo { protected: void _insertToInc( BSONObj& o ); - static void _add( InMemory* im , const BSONObj& a , long& size ); + static void _add( InMemory* im , const BSONObj& a , long& size, long& dupCount ); scoped_ptr _scope; const Config& _config; @@ -278,6 +278,7 @@ namespace mongo { scoped_ptr _temp; long _size; // bytes in _temp + long _dupCount; // number of duplicate key entries long long _numEmits; }; -- cgit v1.2.1