diff options
author | Eliot Horowitz <eliot@10gen.com> | 2009-10-09 14:55:29 -0400 |
---|---|---|
committer | Eliot Horowitz <eliot@10gen.com> | 2009-10-09 14:55:29 -0400 |
commit | 0496aa5ac7e1a99b082f5375c38da9871d08847c (patch) | |
tree | 7664d2f0364120be70157fa5d8f9dccf92d1daab | |
parent | 35bb22d27ec01753a2507d19c3b2822cf29a8dbf (diff) | |
download | mongo-0496aa5ac7e1a99b082f5375c38da9871d08847c.tar.gz |
changed mongoimportJSON to mongoimport - handles json/tsv/csv
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | SConstruct | 2 | ||||
-rw-r--r-- | jstests/tool/tool1.js | 4 | ||||
-rw-r--r-- | tools/export.cpp | 33 | ||||
-rw-r--r-- | tools/import.cpp | 243 | ||||
-rw-r--r-- | tools/importJSON.cpp | 116 | ||||
-rw-r--r-- | tools/tool.cpp | 19 | ||||
-rw-r--r-- | tools/tool.h | 6 |
8 files changed, 280 insertions, 145 deletions
diff --git a/.gitignore b/.gitignore index e6a74705040..453057ef7f7 100644 --- a/.gitignore +++ b/.gitignore @@ -57,7 +57,7 @@ mongorestore mongofiles mongoexport -mongoimportjson +mongoimport mongosniff mongobridge diff --git a/SConstruct b/SConstruct index f37912f2091..135ee402c58 100644 --- a/SConstruct +++ b/SConstruct @@ -840,7 +840,7 @@ env.Program( "mongodump" , allToolFiles + [ "tools/dump.cpp" ] ) env.Program( "mongorestore" , allToolFiles + [ "tools/restore.cpp" ] ) env.Program( "mongoexport" , allToolFiles + [ "tools/export.cpp" ] ) -env.Program( "mongoimportjson" , allToolFiles + [ "tools/importJSON.cpp" ] ) +env.Program( "mongoimport" , allToolFiles + [ "tools/import.cpp" ] ) env.Program( "mongofiles" , allToolFiles + [ "tools/files.cpp" ] ) diff --git a/jstests/tool/tool1.js b/jstests/tool/tool1.js index 173981aa476..00e92e798b7 100644 --- a/jstests/tool/tool1.js +++ b/jstests/tool/tool1.js @@ -37,7 +37,7 @@ runMongoProgram( "mongoexport", "--host", "127.0.0.1:" + port, "-d", baseName, " assert.lt( 10 , fileSize() , "file size changed" ); c.drop(); -runMongoProgram( "mongoimportjson", "--host", "127.0.0.1:" + port, "-d", baseName, "-c", baseName, "--file", externalFile ); +runMongoProgram( "mongoimport", "--host", "127.0.0.1:" + port, "-d", baseName, "-c", baseName, "--file", externalFile ); assert.soon( "c.findOne()" , "mongo import json A" ); assert( c.findOne() && 1 == c.findOne().a , "mongo import json B" ); @@ -57,7 +57,7 @@ resetDbpath( externalPath ); runMongoProgram( "mongoexport", "--dbpath", dbPath, "-d", baseName, "-c", baseName, "--out", externalFile ); resetDbpath( dbPath ); -runMongoProgram( "mongoimportjson", "--dbpath", dbPath, "-d", baseName, "-c", baseName, "--file", externalFile ); +runMongoProgram( "mongoimport", "--dbpath", dbPath, "-d", baseName, "-c", baseName, "--file", externalFile ); m = startMongoProgram( "mongod", "--port", port, "--dbpath", dbPath, "--nohttpinterface", "--bind_ip", "127.0.0.1" ); c = m.getDB( baseName ).getCollection( baseName ); assert.soon( "c.findOne()" , "object missing b" ); diff --git a/tools/export.cpp b/tools/export.cpp index 16c5b93eb22..f142cefecb4 100644 --- a/tools/export.cpp +++ b/tools/export.cpp @@ -26,7 +26,6 @@ #include <iostream> #include <boost/program_options.hpp> -#include <pcrecpp.h> using namespace mongo; @@ -42,7 +41,7 @@ public: ("out,o", po::value<string>(), "output file; if not specified, stdout is used") ; } - + int run(){ string ns; const bool csv = hasParam( "csv" ); @@ -55,8 +54,6 @@ public: BSONObj * fieldsToReturn = 0; BSONObj realFieldsToReturn; - vector<string> fields; - try { ns = getNS(); } catch (...) { @@ -67,24 +64,12 @@ public: auth(); if ( hasParam( "fields" ) ){ - - BSONObjBuilder b; - - string fields_arg = getParam("fields"); - pcrecpp::StringPiece input(fields_arg); - - string f; - pcrecpp::RE re("([\\w\\.]+),?" ); - while ( re.Consume( &input, &f ) ){ - fields.push_back( f ); - b.append( f.c_str() , 1 ); - } - - realFieldsToReturn = b.obj(); - fieldsToReturn = &realFieldsToReturn; + needFields(); + fieldsToReturn = &_fieldsObj; } - if ( csv && fields.size() == 0 ){ + + if ( csv && _fields.size() == 0 ){ cerr << "csv mode requires a field list" << endl; return -1; } @@ -93,8 +78,8 @@ public: auto_ptr<DBClientCursor> cursor = conn().query( ns.c_str() , ((Query)(getParam( "query" , "" ))).snapshot() , 0 , 0 , fieldsToReturn , Option_SlaveOk | Option_NoCursorTimeout ); if ( csv ){ - for ( vector<string>::iterator i=fields.begin(); i != fields.end(); i++ ){ - if ( i != fields.begin() ) + for ( vector<string>::iterator i=_fields.begin(); i != _fields.end(); i++ ){ + if ( i != _fields.begin() ) out << ","; out << *i; } @@ -104,8 +89,8 @@ public: while ( cursor->more() ) { BSONObj obj = cursor->next(); if ( csv ){ - for ( vector<string>::iterator i=fields.begin(); i != fields.end(); i++ ){ - if ( i != fields.begin() ) + for ( vector<string>::iterator i=_fields.begin(); i != _fields.end(); i++ ){ + if ( i != _fields.begin() ) out << ","; const BSONElement & e = obj.getFieldDotted(i->c_str()); if ( ! e.eoo() ){ diff --git a/tools/import.cpp b/tools/import.cpp new file mode 100644 index 00000000000..5bca049146e --- /dev/null +++ b/tools/import.cpp @@ -0,0 +1,243 @@ +// import.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "stdafx.h" +#include "client/dbclient.h" +#include "db/json.h" + +#include "tool.h" + +#include <fstream> +#include <iostream> + +#include <boost/program_options.hpp> + +using namespace mongo; + +namespace po = boost::program_options; + +class Import : public Tool { + + enum Type { JSON , CSV , TSV }; + Type _type; + + const char * _sep; + + bool _appendNumber( BSONObjBuilder& b , const string& fieldName , const string& data ){ + if ( data.size() == 0 ) + return false; + + unsigned int pos=0; + if ( data[0] == '-' ) + pos++; + + bool hasDec = false; + + for ( ; pos<data.size(); pos++ ){ + if ( isdigit(data[pos]) ) + continue; + + if ( data[pos] == '.' ){ + if ( hasDec ) + return false; + hasDec = true; + continue; + } + + return false; + } + + if ( hasDec ){ + double d = atof( data.c_str() ); + b.append( fieldName.c_str() , d ); + return true; + } + + if ( data.size() < 8 ){ + b.append( fieldName , atoi( data.c_str() ) ); + return true; + } + + b.append( fieldName , atol( data.c_str() ) ); + return true; + } + + void _append( BSONObjBuilder& b , const string& fieldName , const string& data ){ + if ( _appendNumber( b , fieldName , data ) ) + return; + + // TODO: other types? + b.append( fieldName.c_str() , data ); + } + + BSONObj parseLine( const char * line ){ + if ( _type == JSON ) + return fromjson( line ); + + BSONObjBuilder b; + + unsigned int pos=0; + while ( line[0] ){ + string name; + if ( pos < _fields.size() ){ + name = _fields[pos]; + } + else { + stringstream ss; + ss << "field" << pos; + name = ss.str(); + } + pos++; + + const char * end = strstr( line , _sep ); + if ( ! end ){ + _append( b , name , string( line ) ); + break; + } + + _append( b , name , string( line , end - line ) ); + line = end + 1; + } + + return b.obj(); + } + +public: + Import() : Tool( "import" ){ + add_options() + ("type",po::value<string>() , "type of file to import. default: json (json,csv,tsv)") + ("fields,f" , po::value<string>() , "comma seperated list of field names e.g. -f name,age" ) + ("file",po::value<string>() , "file to import from; if not specified stdin is used" ) + ("drop", "drop collection first " ) + ; + addPositionArg( "file" , 1 ); + _type = JSON; + } + + int run(){ + string filename = getParam( "file" ); + long long fileSize = -1; + + istream * in = &cin; + + ifstream file( filename.c_str() , ios_base::in | ios_base::binary); + + if ( filename.size() > 0 && filename != "-" ){ + if ( ! exists( filename ) ){ + cerr << "file doesn't exist: " << filename << endl; + return -1; + } + in = &file; + fileSize = file_size( filename ); + } + + string ns; + + try { + ns = getNS(); + } catch (...) { + printHelp(cerr); + return -1; + } + + auth(); + + if ( hasParam( "drop" ) ){ + cout << "dropping: " << ns << endl; + conn().dropCollection( ns.c_str() ); + } + + if ( hasParam( "type" ) ){ + string type = getParam( "type" ); + if ( type == "json" ) + _type = JSON; + else if ( type == "csv" ){ + _type = CSV; + _sep = ","; + } + else if ( type == "tsv" ){ + _type = TSV; + _sep = "\t"; + } + else { + cerr << "don't know what type [" << type << "] is" << endl; + return -1; + } + } + + if ( _type == CSV || _type == TSV ){ + if ( ! hasParam( "fields" ) ){ + cerr << "need to speicfy fields for csv and tsv" << endl; + return -1; + } + needFields(); + } + + int errors = 0; + + int num = 0; + + time_t start = time(0); + + ProgressMeter pm( fileSize ); + const int BUF_SIZE = 1024 * 1024 * 4; + char line[ (1024 * 1024 * 4) + 128]; + while ( *in ){ + in->getline( line , BUF_SIZE ); + + char * buf = line; + while( isspace( buf[0] ) ) buf++; + + int len = strlen( buf ); + if ( ! len ) + continue; + + if ( in->rdstate() == ios_base::eofbit ) + break; + assert( in->rdstate() == 0 ); + + try { + BSONObj o = parseLine( buf ); + conn().insert( ns.c_str() , o ); + } + catch ( std::exception& e ){ + cout << "exception:" << e.what() << endl; + cout << buf << endl; + errors++; + } + + num++; + if ( pm.hit( len + 1 ) ){ + cout << "\t\t\t" << num << "\t" << ( num / ( time(0) - start ) ) << "/second" << endl; + } + } + + cout << "imported " << num << " objects" << endl; + + if ( errors == 0 ) + return 0; + + cerr << "encountered " << errors << " error" << ( errors == 1 ? "" : "s" ) << endl; + return -1; + } +}; + +int main( int argc , char ** argv ) { + Import import; + return import.main( argc , argv ); +} diff --git a/tools/importJSON.cpp b/tools/importJSON.cpp deleted file mode 100644 index e54c18b0fc3..00000000000 --- a/tools/importJSON.cpp +++ /dev/null @@ -1,116 +0,0 @@ -// importJSON.cpp - -/** -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include "stdafx.h" -#include "client/dbclient.h" -#include "db/json.h" - -#include "tool.h" - -#include <fstream> -#include <iostream> - -#include <boost/program_options.hpp> - -using namespace mongo; - -namespace po = boost::program_options; - -class ImportJSON : public Tool { -public: - ImportJSON() : Tool( "importjson" ){ - add_options() - ("file",po::value<string>() , "file to import from; if not specified stdin is used" ) - ("drop", "drop collection first " ) - ; - addPositionArg( "file" , 1 ); - } - - int run(){ - string filename = getParam( "file" ); - long long fileSize = -1; - - istream * in = &cin; - - ifstream file( filename.c_str() , ios_base::in | ios_base::binary); - - if ( filename.size() > 0 && filename != "-" ){ - in = &file; - fileSize = file_size( filename ); - } - - string ns; - - try { - ns = getNS(); - } catch (...) { - printHelp(cerr); - return -1; - } - - auth(); - - if ( hasParam( "drop" ) ){ - cout << "dropping: " << ns << endl; - conn().dropCollection( ns.c_str() ); - } - - int num = 0; - - time_t start = time(0); - - ProgressMeter pm( fileSize ); - const int BUF_SIZE = 1024 * 1024 * 4; - char line[ (1024 * 1024 * 4) + 128]; - while ( *in ){ - in->getline( line , BUF_SIZE ); - - char * buf = line; - while( isspace( buf[0] ) ) buf++; - - int len = strlen( buf ); - if ( ! len ) - continue; - - if ( in->rdstate() == ios_base::eofbit ) - break; - assert( in->rdstate() == 0 ); - - try { - BSONObj o = fromjson( buf ); - conn().insert( ns.c_str() , o ); - } - catch ( MsgAssertionException& ma ){ - cout << "exception:" << ma.toString() << endl; - cout << buf << endl; - } - - num++; - if ( pm.hit( len + 1 ) ){ - cout << "\t\t\t" << num << "\t" << ( num / ( time(0) - start ) ) << "/second" << endl; - } - } - - return 0; - } -}; - -int main( int argc , char ** argv ) { - ImportJSON import; - return import.main( argc , argv ); -} diff --git a/tools/tool.cpp b/tools/tool.cpp index 8aa1cfaac98..5784099a956 100644 --- a/tools/tool.cpp +++ b/tools/tool.cpp @@ -5,6 +5,7 @@ #include <iostream> #include <boost/filesystem/operations.hpp> +#include <pcrecpp.h> #include "util/file_allocator.h" @@ -156,6 +157,24 @@ mongo::DBClientBase& mongo::Tool::conn( bool slaveIfPaired ){ return *_conn; } +void mongo::Tool::needFields(){ + uassert( "you need to specify fields" , hasParam( "fields" ) ); + + BSONObjBuilder b; + + string fields_arg = getParam("fields"); + pcrecpp::StringPiece input(fields_arg); + + string f; + pcrecpp::RE re("([\\w\\.]+),?" ); + while ( re.Consume( &input, &f ) ){ + _fields.push_back( f ); + b.append( f.c_str() , 1 ); + } + + _fieldsObj = b.obj(); +} + void mongo::Tool::auth( string dbname ){ if ( ! dbname.size() ) dbname = _db; diff --git a/tools/tool.h b/tools/tool.h index a398ed5a6d8..c6d2db8780a 100644 --- a/tools/tool.h +++ b/tools/tool.h @@ -61,7 +61,7 @@ namespace mongo { mongo::DBClientBase &conn( bool slaveIfPaired = false ); void auth( string db = "" ); - + string _name; string _db; @@ -70,7 +70,11 @@ namespace mongo { string _username; string _password; + void needFields(); + vector<string> _fields; + BSONObj _fieldsObj; + private: string _host; mongo::DBClientBase * _conn; |