summary | refs | log | tree | commit | diff
path: root/src/mongo/util/text.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/util/text.cpp')
-rw-r--r-- src/mongo/util/text.cpp 565
1 files changed, 280 insertions, 285 deletions
diff --git a/src/mongo/util/text.cpp b/src/mongo/util/text.cpp
index 264c9a0e771..24bc71cf059 100644
--- a/src/mongo/util/text.cpp
+++ b/src/mongo/util/text.cpp
@@ -46,316 +46,312 @@ using namespace std;
namespace mongo {
- // --- StringSplitter ----
-
- /** get next split string fragment */
- string StringSplitter::next() {
- const char * foo = strstr( _big , _splitter );
- if ( foo ) {
- string s( _big , foo - _big );
- _big = foo + strlen( _splitter );
- while ( *_big && strstr( _big , _splitter ) == _big )
- _big++;
- return s;
- }
-
- string s = _big;
- _big += strlen( _big );
+// --- StringSplitter ----
+
+/** get next split string fragment */
+string StringSplitter::next() {
+ const char* foo = strstr(_big, _splitter);
+ if (foo) {
+ string s(_big, foo - _big);
+ _big = foo + strlen(_splitter);
+ while (*_big && strstr(_big, _splitter) == _big)
+ _big++;
return s;
}
+ string s = _big;
+ _big += strlen(_big);
+ return s;
+}
- void StringSplitter::split( vector<string>& l ) {
- while ( more() ) {
- l.push_back( next() );
- }
- }
- vector<string> StringSplitter::split() {
- vector<string> l;
- split( l );
- return l;
+void StringSplitter::split(vector<string>& l) {
+ while (more()) {
+ l.push_back(next());
}
+}
- string StringSplitter::join( const vector<string>& l , const string& split ) {
- stringstream ss;
- for ( unsigned i=0; i<l.size(); i++ ) {
- if ( i > 0 )
- ss << split;
- ss << l[i];
- }
- return ss.str();
- }
+vector<string> StringSplitter::split() {
+ vector<string> l;
+ split(l);
+ return l;
+}
- vector<string> StringSplitter::split( const string& big , const string& splitter ) {
- StringSplitter ss( big.c_str() , splitter.c_str() );
- return ss.split();
+string StringSplitter::join(const vector<string>& l, const string& split) {
+ stringstream ss;
+ for (unsigned i = 0; i < l.size(); i++) {
+ if (i > 0)
+ ss << split;
+ ss << l[i];
}
-
-
-
- // --- utf8 utils ------
-
- inline int leadingOnes(unsigned char c) {
- if (c < 0x80) return 0;
- static const char _leadingOnes[128] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 - 0x8F
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 - 0x99
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xA0 - 0xA9
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xB0 - 0xB9
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 - 0xC9
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0 - 0xD9
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0 - 0xE9
- 4, 4, 4, 4, 4, 4, 4, 4, // 0xF0 - 0xF7
- 5, 5, 5, 5, // 0xF8 - 0xFB
- 6, 6, // 0xFC - 0xFD
- 7, // 0xFE
- 8, // 0xFF
- };
- return _leadingOnes[c & 0x7f];
+ return ss.str();
+}
- }
+vector<string> StringSplitter::split(const string& big, const string& splitter) {
+ StringSplitter ss(big.c_str(), splitter.c_str());
+ return ss.split();
+}
- bool isValidUTF8(const std::string& s) {
- return isValidUTF8(s.c_str());
- }
- bool isValidUTF8(const char *s) {
- int left = 0; // how many bytes are left in the current codepoint
- while (*s) {
- const unsigned char c = (unsigned char) *(s++);
- const int ones = leadingOnes(c);
- if (left) {
- if (ones != 1) return false; // should be a continuation byte
- left--;
- }
- else {
- if (ones == 0) continue; // ASCII byte
- if (ones == 1) return false; // unexpected continuation byte
- if (c > 0xF4) return false; // codepoint too large (< 0x10FFFF)
- if (c == 0xC0 || c == 0xC1) return false; // codepoints <= 0x7F shouldn't be 2 bytes
-
- // still valid
- left = ones-1;
- }
+// --- utf8 utils ------
+
+inline int leadingOnes(unsigned char c) {
+ if (c < 0x80)
+ return 0;
+ static const char _leadingOnes[128] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 - 0x8F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 - 0x9F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xA0 - 0xAF
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xB0 - 0xBF
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 - 0xCF
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0 - 0xDF
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0 - 0xEF
+ 4, 4, 4, 4, 4, 4, 4, 4, // 0xF0 - 0xF7
+ 5, 5, 5, 5, // 0xF8 - 0xFB
+ 6, 6, // 0xFC - 0xFD
+ 7, // 0xFE
+ 8, // 0xFF
+ };
+ return _leadingOnes[c & 0x7f];
+}
+
+bool isValidUTF8(const std::string& s) {
+ return isValidUTF8(s.c_str());
+}
+
+bool isValidUTF8(const char* s) {
+ int left = 0; // how many bytes are left in the current codepoint
+ while (*s) {
+ const unsigned char c = (unsigned char)*(s++);
+ const int ones = leadingOnes(c);
+ if (left) {
+ if (ones != 1)
+ return false; // should be a continuation byte
+ left--;
+ } else {
+ if (ones == 0)
+ continue; // ASCII byte
+ if (ones == 1)
+ return false; // unexpected continuation byte
+ if (c > 0xF4)
+ return false; // codepoint too large (> 0x10FFFF)
+ if (c == 0xC0 || c == 0xC1)
+ return false; // codepoints <= 0x7F shouldn't be 2 bytes
+
+ // still valid
+ left = ones - 1;
}
- if (left!=0) return false; // string ended mid-codepoint
- return true;
}
+ if (left != 0)
+ return false; // string ended mid-codepoint
+ return true;
+}
- long long parseLL( const char *n ) {
- long long ret;
- uassert( 13307, "cannot convert empty string to long long", *n != 0 );
+long long parseLL(const char* n) {
+ long long ret;
+ uassert(13307, "cannot convert empty string to long long", *n != 0);
#if !defined(_WIN32)
- char *endPtr = 0;
- errno = 0;
- ret = strtoll( n, &endPtr, 10 );
- uassert( 13305, "could not convert string to long long", *endPtr == 0 && errno == 0 );
+ char* endPtr = 0;
+ errno = 0;
+ ret = strtoll(n, &endPtr, 10);
+ uassert(13305, "could not convert string to long long", *endPtr == 0 && errno == 0);
#else
- size_t endLen = 0;
- try {
- ret = stoll( n, &endLen, 10 );
- }
- catch ( ... ) {
- endLen = 0;
- }
- uassert( 13306, "could not convert string to long long", endLen != 0 && n[ endLen ] == 0 );
-#endif // !defined(_WIN32)
- return ret;
+ size_t endLen = 0;
+ try {
+ ret = stoll(n, &endLen, 10);
+ } catch (...) {
+ endLen = 0;
}
+ uassert(13306, "could not convert string to long long", endLen != 0 && n[endLen] == 0);
+#endif // !defined(_WIN32)
+ return ret;
+}
#if defined(_WIN32)
- std::string toUtf8String(const std::wstring& wide) {
- if (wide.size() > boost::integer_traits<int>::const_max)
- throw std::length_error(
- "Wide string cannot be more than INT_MAX characters long.");
- if (wide.size() == 0)
- return "";
-
- // Calculate necessary buffer size
- int len = ::WideCharToMultiByte(
- CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()),
- NULL, 0, NULL, NULL);
+std::string toUtf8String(const std::wstring& wide) {
+ if (wide.size() > boost::integer_traits<int>::const_max)
+ throw std::length_error("Wide string cannot be more than INT_MAX characters long.");
+ if (wide.size() == 0)
+ return "";
- // Perform actual conversion
+ // Calculate necessary buffer size
+ int len = ::WideCharToMultiByte(
+ CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()), NULL, 0, NULL, NULL);
+
+ // Perform actual conversion
+ if (len > 0) {
+ std::vector<char> buffer(len);
+ len = ::WideCharToMultiByte(CP_UTF8,
+ 0,
+ wide.c_str(),
+ static_cast<int>(wide.size()),
+ &buffer[0],
+ static_cast<int>(buffer.size()),
+ NULL,
+ NULL);
if (len > 0) {
- std::vector<char> buffer(len);
- len = ::WideCharToMultiByte(
- CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()),
- &buffer[0], static_cast<int>(buffer.size()), NULL, NULL);
- if (len > 0) {
- verify(len == static_cast<int>(buffer.size()));
- return std::string(&buffer[0], buffer.size());
- }
+ verify(len == static_cast<int>(buffer.size()));
+ return std::string(&buffer[0], buffer.size());
}
-
- msgasserted( 16091 ,
- mongoutils::str::stream() << "can't wstring to utf8: " << ::GetLastError() );
- return "";
}
- std::wstring toWideString(const char *utf8String) {
- int bufferSize = MultiByteToWideChar(
- CP_UTF8, // Code page
- 0, // Flags
- utf8String, // Input string
- -1, // Count, -1 for NUL-terminated
- NULL, // No output buffer
- 0 // Zero means "compute required size"
- );
- if ( bufferSize == 0 ) {
- return std::wstring();
- }
- std::unique_ptr< wchar_t []> tempBuffer( new wchar_t[ bufferSize ] );
- tempBuffer[0] = 0;
- MultiByteToWideChar(
- CP_UTF8, // Code page
- 0, // Flags
- utf8String, // Input string
- -1, // Count, -1 for NUL-terminated
- tempBuffer.get(), // UTF-16 output buffer
- bufferSize // Buffer size in wide characters
- );
- return std::wstring( tempBuffer.get() );
+ msgasserted(16091, mongoutils::str::stream() << "can't wstring to utf8: " << ::GetLastError());
+ return "";
+}
+
+std::wstring toWideString(const char* utf8String) {
+ int bufferSize = MultiByteToWideChar(CP_UTF8, // Code page
+ 0, // Flags
+ utf8String, // Input string
+ -1, // Count, -1 for NUL-terminated
+ NULL, // No output buffer
+ 0 // Zero means "compute required size"
+ );
+ if (bufferSize == 0) {
+ return std::wstring();
}
+ std::unique_ptr<wchar_t[]> tempBuffer(new wchar_t[bufferSize]);
+ tempBuffer[0] = 0;
+ MultiByteToWideChar(CP_UTF8, // Code page
+ 0, // Flags
+ utf8String, // Input string
+ -1, // Count, -1 for NUL-terminated
+ tempBuffer.get(), // UTF-16 output buffer
+ bufferSize // Buffer size in wide characters
+ );
+ return std::wstring(tempBuffer.get());
+}
- /**
- * Write a UTF-8 string to the Windows console in Unicode (UTF-16)
- *
- * @param utf8String UTF-8 input string
- * @param utf8StringSize Number of bytes in UTF-8 string, no NUL terminator assumed
- * @return true if all characters were displayed (including zero characters)
- */
- bool writeUtf8ToWindowsConsole( const char* utf8String, unsigned int utf8StringSize ) {
- int bufferSize = MultiByteToWideChar(
- CP_UTF8, // Code page
- 0, // Flags
- utf8String, // Input string
- utf8StringSize, // Input string length
- NULL, // No output buffer
- 0 // Zero means "compute required size"
- );
- if ( bufferSize == 0 ) {
- return true;
+/**
+ * Write a UTF-8 string to the Windows console in Unicode (UTF-16)
+ *
+ * @param utf8String UTF-8 input string
+ * @param utf8StringSize Number of bytes in UTF-8 string, no NUL terminator assumed
+ * @return true if all characters were displayed (including zero characters)
+ */
+bool writeUtf8ToWindowsConsole(const char* utf8String, unsigned int utf8StringSize) {
+ int bufferSize = MultiByteToWideChar(CP_UTF8, // Code page
+ 0, // Flags
+ utf8String, // Input string
+ utf8StringSize, // Input string length
+ NULL, // No output buffer
+ 0 // Zero means "compute required size"
+ );
+ if (bufferSize == 0) {
+ return true;
+ }
+ std::unique_ptr<wchar_t[]> utf16String(new wchar_t[bufferSize]);
+ MultiByteToWideChar(CP_UTF8, // Code page
+ 0, // Flags
+ utf8String, // Input string
+ utf8StringSize, // Input string length
+ utf16String.get(), // UTF-16 output buffer
+ bufferSize // Buffer size in wide characters
+ );
+ const wchar_t* utf16Pointer = utf16String.get();
+ size_t numberOfCharactersToWrite = bufferSize;
+ HANDLE consoleHandle = GetStdHandle(STD_OUTPUT_HANDLE);
+ while (numberOfCharactersToWrite > 0) {
+ static const DWORD MAXIMUM_CHARACTERS_PER_PASS = 8 * 1024;
+ DWORD numberOfCharactersThisPass = static_cast<DWORD>(numberOfCharactersToWrite);
+ if (numberOfCharactersThisPass > MAXIMUM_CHARACTERS_PER_PASS) {
+ numberOfCharactersThisPass = MAXIMUM_CHARACTERS_PER_PASS;
}
- std::unique_ptr<wchar_t[]> utf16String( new wchar_t[ bufferSize ] );
- MultiByteToWideChar(
- CP_UTF8, // Code page
- 0, // Flags
- utf8String, // Input string
- utf8StringSize, // Input string length
- utf16String.get(), // UTF-16 output buffer
- bufferSize // Buffer size in wide characters
- );
- const wchar_t* utf16Pointer = utf16String.get();
- size_t numberOfCharactersToWrite = bufferSize;
- HANDLE consoleHandle = GetStdHandle( STD_OUTPUT_HANDLE );
- while ( numberOfCharactersToWrite > 0 ) {
- static const DWORD MAXIMUM_CHARACTERS_PER_PASS = 8 * 1024;
- DWORD numberOfCharactersThisPass = static_cast<DWORD>( numberOfCharactersToWrite );
- if ( numberOfCharactersThisPass > MAXIMUM_CHARACTERS_PER_PASS ) {
- numberOfCharactersThisPass = MAXIMUM_CHARACTERS_PER_PASS;
- }
- DWORD numberOfCharactersWritten;
- BOOL success = WriteConsoleW( consoleHandle,
- utf16Pointer,
- numberOfCharactersThisPass,
- &numberOfCharactersWritten,
- NULL );
- if ( 0 == success ) {
- DWORD dosError = GetLastError();
- static bool errorMessageShown = false;
- if ( ERROR_GEN_FAILURE == dosError ) {
- if ( ! errorMessageShown ) {
- std::cout << "\n---\nUnicode text could not be correctly displayed.\n"
- "Please change your console font to a Unicode font "
- "(e.g. Lucida Console).\n---\n" << std::endl;
- errorMessageShown = true;
- }
- // we can't display the text properly using a raster font,
- // but we can display the bits that will display ...
- _write( 1, utf8String, utf8StringSize );
+ DWORD numberOfCharactersWritten;
+ BOOL success = WriteConsoleW(consoleHandle,
+ utf16Pointer,
+ numberOfCharactersThisPass,
+ &numberOfCharactersWritten,
+ NULL);
+ if (0 == success) {
+ DWORD dosError = GetLastError();
+ static bool errorMessageShown = false;
+ if (ERROR_GEN_FAILURE == dosError) {
+ if (!errorMessageShown) {
+ std::cout << "\n---\nUnicode text could not be correctly displayed.\n"
+ "Please change your console font to a Unicode font "
+ "(e.g. Lucida Console).\n---\n" << std::endl;
+ errorMessageShown = true;
}
- return false;
+ // we can't display the text properly using a raster font,
+ // but we can display the bits that will display ...
+ _write(1, utf8String, utf8StringSize);
}
- numberOfCharactersToWrite -= numberOfCharactersWritten;
- utf16Pointer += numberOfCharactersWritten;
+ return false;
}
- return true;
+ numberOfCharactersToWrite -= numberOfCharactersWritten;
+ utf16Pointer += numberOfCharactersWritten;
}
+ return true;
+}
- WindowsCommandLine::WindowsCommandLine(int argc, wchar_t* argvW[], wchar_t* envpW[]) :
- _argv(NULL), _envp(NULL) {
-
- // Construct UTF-8 copy of arguments
- vector<string> utf8args;
- vector<size_t> utf8argLength;
- size_t blockSize = argc * sizeof(char*);
- size_t blockPtr = blockSize;
- for (int i = 0; i < argc; ++i) {
- utf8args.push_back( toUtf8String(argvW[i]) );
- size_t argLength = utf8args[i].length() + 1;
- utf8argLength.push_back(argLength);
- blockSize += argLength;
- }
- _argv = static_cast<char**>(mongoMalloc(blockSize));
- for (int i = 0; i < argc; ++i) {
- _argv[i] = reinterpret_cast<char*>(_argv) + blockPtr;
- strcpy_s(_argv[i], utf8argLength[i], utf8args[i].c_str());
- blockPtr += utf8argLength[i];
- }
-
- // Construct UTF-8 copy of environment strings
- size_t envCount = 0;
- wchar_t** envpWptr = &envpW[0];
- while (*envpWptr++) {
- ++envCount;
- }
- vector<string> utf8envs;
- vector<size_t> utf8envLength;
- blockSize = (envCount + 1) * sizeof(char*);
- blockPtr = blockSize;
- for (size_t i = 0; i < envCount; ++i) {
- utf8envs.push_back( toUtf8String(envpW[i]) );
- size_t envLength = utf8envs[i].length() + 1;
- utf8envLength.push_back(envLength);
- blockSize += envLength;
- }
- _envp = static_cast<char**>(mongoMalloc(blockSize));
- size_t i;
- for (i = 0; i < envCount; ++i) {
- _envp[i] = reinterpret_cast<char*>(_envp) + blockPtr;
- strcpy_s(_envp[i], utf8envLength[i], utf8envs[i].c_str());
- blockPtr += utf8envLength[i];
- }
- _envp[i] = NULL;
+WindowsCommandLine::WindowsCommandLine(int argc, wchar_t* argvW[], wchar_t* envpW[])
+ : _argv(NULL), _envp(NULL) {
+ // Construct UTF-8 copy of arguments
+ vector<string> utf8args;
+ vector<size_t> utf8argLength;
+ size_t blockSize = argc * sizeof(char*);
+ size_t blockPtr = blockSize;
+ for (int i = 0; i < argc; ++i) {
+ utf8args.push_back(toUtf8String(argvW[i]));
+ size_t argLength = utf8args[i].length() + 1;
+ utf8argLength.push_back(argLength);
+ blockSize += argLength;
}
-
- WindowsCommandLine::~WindowsCommandLine() {
- free(_argv);
- free(_envp);
+ _argv = static_cast<char**>(mongoMalloc(blockSize));
+ for (int i = 0; i < argc; ++i) {
+ _argv[i] = reinterpret_cast<char*>(_argv) + blockPtr;
+ strcpy_s(_argv[i], utf8argLength[i], utf8args[i].c_str());
+ blockPtr += utf8argLength[i];
}
-#endif // #if defined(_WIN32)
+ // Construct UTF-8 copy of environment strings
+ size_t envCount = 0;
+ wchar_t** envpWptr = &envpW[0];
+ while (*envpWptr++) {
+ ++envCount;
+ }
+ vector<string> utf8envs;
+ vector<size_t> utf8envLength;
+ blockSize = (envCount + 1) * sizeof(char*);
+ blockPtr = blockSize;
+ for (size_t i = 0; i < envCount; ++i) {
+ utf8envs.push_back(toUtf8String(envpW[i]));
+ size_t envLength = utf8envs[i].length() + 1;
+ utf8envLength.push_back(envLength);
+ blockSize += envLength;
+ }
+ _envp = static_cast<char**>(mongoMalloc(blockSize));
+ size_t i;
+ for (i = 0; i < envCount; ++i) {
+ _envp[i] = reinterpret_cast<char*>(_envp) + blockPtr;
+ strcpy_s(_envp[i], utf8envLength[i], utf8envs[i].c_str());
+ blockPtr += utf8envLength[i];
+ }
+ _envp[i] = NULL;
+}
- // See "Parsing C++ Command-Line Arguments (C++)"
- // http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx
- static void quoteForWindowsCommandLine(const std::string& arg, std::ostream& os) {
- if (arg.empty()) {
- os << "\"\"";
- }
- else if (arg.find_first_of(" \t\"") == std::string::npos) {
- os << arg;
- }
- else {
- os << '"';
- std::string backslashes = "";
- for (std::string::const_iterator iter = arg.begin(), end = arg.end();
- iter != end; ++iter) {
+WindowsCommandLine::~WindowsCommandLine() {
+ free(_argv);
+ free(_envp);
+}
- switch (*iter) {
+#endif // #if defined(_WIN32)
+
+// See "Parsing C++ Command-Line Arguments (C++)"
+// http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx
+static void quoteForWindowsCommandLine(const std::string& arg, std::ostream& os) {
+ if (arg.empty()) {
+ os << "\"\"";
+ } else if (arg.find_first_of(" \t\"") == std::string::npos) {
+ os << arg;
+ } else {
+ os << '"';
+ std::string backslashes = "";
+ for (std::string::const_iterator iter = arg.begin(), end = arg.end(); iter != end; ++iter) {
+ switch (*iter) {
case '\\':
backslashes.push_back(*iter);
if (iter + 1 == end)
@@ -368,26 +364,25 @@ namespace mongo {
os << backslashes << *iter;
backslashes.clear();
break;
- }
}
- os << '"';
}
+ os << '"';
}
+}
- std::string constructUtf8WindowsCommandLine(const std::vector<std::string>& argv) {
- if (argv.empty())
- return "";
+std::string constructUtf8WindowsCommandLine(const std::vector<std::string>& argv) {
+ if (argv.empty())
+ return "";
- std::ostringstream commandLine;
- std::vector<std::string>::const_iterator iter = argv.begin();
- std::vector<std::string>::const_iterator end = argv.end();
+ std::ostringstream commandLine;
+ std::vector<std::string>::const_iterator iter = argv.begin();
+ std::vector<std::string>::const_iterator end = argv.end();
+ quoteForWindowsCommandLine(*iter, commandLine);
+ ++iter;
+ for (; iter != end; ++iter) {
+ commandLine << ' ';
quoteForWindowsCommandLine(*iter, commandLine);
- ++iter;
- for (; iter != end; ++iter) {
- commandLine << ' ';
- quoteForWindowsCommandLine(*iter, commandLine);
- }
- return commandLine.str();
}
+ return commandLine.str();
+}
}
-