diff options
author | Adam Chelminski <adam.chelminski@mongodb.com> | 2015-08-13 11:16:35 -0400 |
---|---|---|
committer | Adam Chelminski <adam.chelminski@mongodb.com> | 2015-08-14 13:50:48 -0400 |
commit | a0bbce24216ee2a8a30ef606a76c663d61aacff0 (patch) | |
tree | 0f7cdbd9d385ace60abbcd5834636ca269df219d /src/mongo/db/fts/unicode | |
parent | 3211eea8dbfe317ad3e1434abf1a4cd7190a1b1c (diff) | |
download | mongo-a0bbce24216ee2a8a30ef606a76c663d61aacff0.tar.gz |
SERVER-19944 Improve text index v3 performance
Diffstat (limited to 'src/mongo/db/fts/unicode')
-rw-r--r-- | src/mongo/db/fts/unicode/gen_delimiter_list.py | 7 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string.cpp | 87 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string.h | 38 |
3 files changed, 108 insertions, 24 deletions
diff --git a/src/mongo/db/fts/unicode/gen_delimiter_list.py b/src/mongo/db/fts/unicode/gen_delimiter_list.py index 52b79544c6b..5b678400e19 100644 --- a/src/mongo/db/fts/unicode/gen_delimiter_list.py +++ b/src/mongo/db/fts/unicode/gen_delimiter_list.py @@ -65,6 +65,13 @@ DelimiterListLanguage lang) { return false; } + // Most characters are latin letters, so filter those out first. + if (codepoint >= 'A' && codepoint <= 'Z') { + return false; + } else if (codepoint >= 'a' && codepoint <= 'z') { + return false; + } + switch (codepoint) {\n""") for delim in sorted(delim_codepoints): diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp index 9f29749edbc..9c18a776787 100644 --- a/src/mongo/db/fts/unicode/string.cpp +++ b/src/mongo/db/fts/unicode/string.cpp @@ -42,10 +42,16 @@ using linenoise_utf8::copyString8to32; using std::u32string; String::String(const StringData utf8_src) { + // Reserve space for underlying buffers to prevent excessive reallocations. + _outputBuf.reserve(utf8_src.size() * 4); + _data.reserve(utf8_src.size() * 4); + + // Convert UTF-8 input to UTF-32 data. setData(utf8_src); } void String::resetData(const StringData utf8_src) { + // Convert UTF-8 input to UTF-32 data. setData(utf8_src); } @@ -70,20 +76,28 @@ void String::setData(const StringData utf8_src) { // Resize _data so it is only as big as what it contains. _data.resize(resultSize); + _needsOutputConversion = true; } -String::String(u32string&& src) : _data(std::move(src)) {} - -std::string String::toString() const { - // output is the target, resize it so that it's guaranteed to fit all of the input characters, - // plus a null character if there isn't one. - std::string output(_data.size() * 4 + 1, '\0'); - size_t resultSize = - copyString32to8(reinterpret_cast<unsigned char*>(&output[0]), &_data[0], output.size()); +String::String(u32string&& src) : _data(std::move(src)), _needsOutputConversion(true) { + // Reserve space for underlying buffers to prevent excessive reallocations. + _outputBuf.reserve(src.size() * 4); + _data.reserve(src.size() * 4); +} - // Resize output so it is only as large as what it contains. - output.resize(resultSize); - return output; +std::string String::toString() { + // _outputBuf is the target, resize it so that it's guaranteed to fit all of the input + // characters, plus a null character if there isn't one. + if (_needsOutputConversion) { + _outputBuf.resize(_data.size() * 4 + 1); + size_t resultSize = copyString32to8( + reinterpret_cast<unsigned char*>(&_outputBuf[0]), &_data[0], _outputBuf.size()); + + // Resize output so it is only as large as what it contains. + _outputBuf.resize(resultSize); + _needsOutputConversion = false; + } + return _outputBuf; } size_t String::size() const { @@ -95,30 +109,61 @@ const char32_t& String::operator[](int i) const { } String String::substr(size_t pos, size_t len) const { - return String(_data.substr(pos, len)); + unicode::String buf; + substrToBuf(pos, len, buf); + return buf; } String String::toLower(CaseFoldMode mode) const { - u32string newdata(_data.size(), 0); + unicode::String buf; + toLowerToBuf(mode, buf); + return buf; +} + +String String::removeDiacritics() const { + unicode::String buf; + removeDiacriticsToBuf(buf); + return buf; +} + +void String::copyToBuf(String& buffer) const { + buffer._data = _data; + buffer._data.resize(_data.size()); auto index = 0; for (auto codepoint : _data) { - newdata[index++] = codepointToLower(codepoint, mode); + buffer._data[index++] = codepoint; } + buffer._needsOutputConversion = true; +} - return String(std::move(newdata)); +void String::substrToBuf(size_t pos, size_t len, String& buffer) const { + buffer._data.resize(len + 1); + for (size_t index = 0, src_pos = pos; index < len;) { + buffer._data[index++] = _data[src_pos++]; + } + buffer._data[len] = '\0'; + buffer._needsOutputConversion = true; } -String String::removeDiacritics() const { - u32string newdata(_data.size(), 0); +void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const { + buffer._data.resize(_data.size()); + auto index = 0; + for (auto codepoint : _data) { + buffer._data[index++] = codepointToLower(codepoint, mode); + } + buffer._needsOutputConversion = true; +} + +void String::removeDiacriticsToBuf(String& buffer) const { + buffer._data.resize(_data.size()); auto index = 0; for (auto codepoint : _data) { if (!codepointIsDiacritic(codepoint)) { - newdata[index++] = codepointRemoveDiacritics(codepoint); + buffer._data[index++] = codepointRemoveDiacritics(codepoint); } } - - newdata.resize(index); - return String(std::move(newdata)); + buffer._data.resize(index); + buffer._needsOutputConversion = true; } bool String::substrMatch(const String& str, diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h index ddfa6f93870..c3355ee4f25 100644 --- a/src/mongo/db/fts/unicode/string.h +++ b/src/mongo/db/fts/unicode/string.h @@ -67,7 +67,7 @@ public: void resetData(const StringData utf8_src); /** - * Return a lowercased version of the String instance using the Unicode data in u_data.h. + * Return a lowercased version of the String instance. */ String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const; @@ -82,9 +82,30 @@ public: String substr(size_t begin, size_t end) const; /** - * Returns a UTF-8 encoded std::string version of the String instance. + * Copies the current String to another String. */ - std::string toString() const; + void copyToBuf(String& buffer) const; + + /** + * Takes a substring of the current String and puts it in another String. + */ + void substrToBuf(size_t pos, size_t len, String& buffer) const; + + /** + * Lowercases the current String and stores the result in another String. + */ + void toLowerToBuf(CaseFoldMode mode, String& buffer) const; + + /** + * Removes diacritics from the current String and stores the result in another String. + */ + void removeDiacriticsToBuf(String& buffer) const; + + /** + * Returns a UTF-8 encoded std::string version of the String instance. Uses the conversion + * stored in the output buffer when possible. + */ + std::string toString(); /** * Returns the number Unicode codepoints in the String. @@ -143,6 +164,17 @@ private: * The underlying UTF-32 data. */ std::u32string _data; + + /** + * A buffer for storing the result of the UTF-32 to UTF-8 conversion. + */ + std::string _outputBuf; + + /** + * A bool flag that is set to true when toString() will require that the UTF-32 to UTF-8 + * conversion be applied again. + */ + bool _needsOutputConversion; }; } // namespace unicode |