diff options
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/fts/fts_unicode_tokenizer.cpp | 17 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_unicode_tokenizer.h | 3 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/gen_delimiter_list.py | 7 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string.cpp | 87 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string.h | 38 |
5 files changed, 120 insertions, 32 deletions
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.cpp b/src/mongo/db/fts/fts_unicode_tokenizer.cpp index 8cdce180dea..a5460448505 100644 --- a/src/mongo/db/fts/fts_unicode_tokenizer.cpp +++ b/src/mongo/db/fts/fts_unicode_tokenizer.cpp @@ -62,7 +62,7 @@ UnicodeFTSTokenizer::UnicodeFTSTokenizer(const FTSLanguage* language) void UnicodeFTSTokenizer::reset(StringData document, Options options) { _options = options; _pos = 0; - _document = unicode::String(document); + _document.resetData(document); // Skip any leading delimiters (and handle the case where the document is entirely delimiters). _skipDelimiters(); @@ -81,29 +81,30 @@ bool UnicodeFTSTokenizer::moveNext() { (!unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage))) { ++_pos; } - unicode::String token = _document.substr(start, _pos - start); + _document.substrToBuf(start, _pos - start, _tokenBuf); // Skip the delimiters before the next token. _skipDelimiters(); // Stop words are case-sensitive and diacritic sensitive, so we need them to be lower cased // but with diacritics not removed to check against the stop word list. - unicode::String word = token.toLower(_caseFoldMode); + _tokenBuf.toLowerToBuf(_caseFoldMode, _wordBuf); - if ((_options & kFilterStopWords) && _stopWords->isStopWord(word.toString())) { + if ((_options & kFilterStopWords) && _stopWords->isStopWord(_wordBuf.toString())) { continue; } if (_options & kGenerateCaseSensitiveTokens) { - word = token; + _tokenBuf.copyToBuf(_wordBuf); } // The stemmer is diacritic sensitive, so stem the word before removing diacritics. - _stem = _stemmer.stem(word.toString()); + _stem = _stemmer.stem(_wordBuf.toString()); if (!(_options & kGenerateDiacriticSensitiveTokens)) { - token.resetData(_stem); - _stem = token.removeDiacritics().toString(); + _tokenBuf.resetData(_stem); + _tokenBuf.removeDiacriticsToBuf(_wordBuf); + _stem = _wordBuf.toString(); } return true; diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.h b/src/mongo/db/fts/fts_unicode_tokenizer.h index 0312ffc300b..c6feeca9725 100644 --- a/src/mongo/db/fts/fts_unicode_tokenizer.h +++ b/src/mongo/db/fts/fts_unicode_tokenizer.h @@ -83,6 +83,9 @@ private: unicode::String _document; size_t _pos; + unicode::String _tokenBuf; + unicode::String _wordBuf; + Options _options; std::string _stem; diff --git a/src/mongo/db/fts/unicode/gen_delimiter_list.py b/src/mongo/db/fts/unicode/gen_delimiter_list.py index 52b79544c6b..5b678400e19 100644 --- a/src/mongo/db/fts/unicode/gen_delimiter_list.py +++ b/src/mongo/db/fts/unicode/gen_delimiter_list.py @@ -65,6 +65,13 @@ DelimiterListLanguage lang) { return false; } + // Most characters are latin letters, so filter those out first. + if (codepoint >= 'A' && codepoint <= 'Z') { + return false; + } else if (codepoint >= 'a' && codepoint <= 'z') { + return false; + } + switch (codepoint) {\n""") for delim in sorted(delim_codepoints): diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp index 9f29749edbc..9c18a776787 100644 --- a/src/mongo/db/fts/unicode/string.cpp +++ b/src/mongo/db/fts/unicode/string.cpp @@ -42,10 +42,16 @@ using linenoise_utf8::copyString8to32; using std::u32string; String::String(const StringData utf8_src) { + // Reserve space for underlying buffers to prevent excessive reallocations. + _outputBuf.reserve(utf8_src.size() * 4); + _data.reserve(utf8_src.size() * 4); + + // Convert UTF-8 input to UTF-32 data. setData(utf8_src); } void String::resetData(const StringData utf8_src) { + // Convert UTF-8 input to UTF-32 data. setData(utf8_src); } @@ -70,20 +76,28 @@ void String::setData(const StringData utf8_src) { // Resize _data so it is only as big as what it contains. _data.resize(resultSize); + _needsOutputConversion = true; } -String::String(u32string&& src) : _data(std::move(src)) {} - -std::string String::toString() const { - // output is the target, resize it so that it's guaranteed to fit all of the input characters, - // plus a null character if there isn't one. - std::string output(_data.size() * 4 + 1, '\0'); - size_t resultSize = - copyString32to8(reinterpret_cast<unsigned char*>(&output[0]), &_data[0], output.size()); +String::String(u32string&& src) : _data(std::move(src)), _needsOutputConversion(true) { + // Reserve space for underlying buffers to prevent excessive reallocations. + _outputBuf.reserve(src.size() * 4); + _data.reserve(src.size() * 4); +} - // Resize output so it is only as large as what it contains. - output.resize(resultSize); - return output; +std::string String::toString() { + // _outputBuf is the target, resize it so that it's guaranteed to fit all of the input + // characters, plus a null character if there isn't one. + if (_needsOutputConversion) { + _outputBuf.resize(_data.size() * 4 + 1); + size_t resultSize = copyString32to8( + reinterpret_cast<unsigned char*>(&_outputBuf[0]), &_data[0], _outputBuf.size()); + + // Resize output so it is only as large as what it contains. + _outputBuf.resize(resultSize); + _needsOutputConversion = false; + } + return _outputBuf; } size_t String::size() const { @@ -95,30 +109,61 @@ const char32_t& String::operator[](int i) const { } String String::substr(size_t pos, size_t len) const { - return String(_data.substr(pos, len)); + unicode::String buf; + substrToBuf(pos, len, buf); + return buf; } String String::toLower(CaseFoldMode mode) const { - u32string newdata(_data.size(), 0); + unicode::String buf; + toLowerToBuf(mode, buf); + return buf; +} + +String String::removeDiacritics() const { + unicode::String buf; + removeDiacriticsToBuf(buf); + return buf; +} + +void String::copyToBuf(String& buffer) const { + buffer._data = _data; + buffer._data.resize(_data.size()); auto index = 0; for (auto codepoint : _data) { - newdata[index++] = codepointToLower(codepoint, mode); + buffer._data[index++] = codepoint; } + buffer._needsOutputConversion = true; +} - return String(std::move(newdata)); +void String::substrToBuf(size_t pos, size_t len, String& buffer) const { + buffer._data.resize(len + 1); + for (size_t index = 0, src_pos = pos; index < len;) { + buffer._data[index++] = _data[src_pos++]; + } + buffer._data[len] = '\0'; + buffer._needsOutputConversion = true; } -String String::removeDiacritics() const { - u32string newdata(_data.size(), 0); +void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const { + buffer._data.resize(_data.size()); + auto index = 0; + for (auto codepoint : _data) { + buffer._data[index++] = codepointToLower(codepoint, mode); + } + buffer._needsOutputConversion = true; +} + +void String::removeDiacriticsToBuf(String& buffer) const { + buffer._data.resize(_data.size()); auto index = 0; for (auto codepoint : _data) { if (!codepointIsDiacritic(codepoint)) { - newdata[index++] = codepointRemoveDiacritics(codepoint); + buffer._data[index++] = codepointRemoveDiacritics(codepoint); } } - - newdata.resize(index); - return String(std::move(newdata)); + buffer._data.resize(index); + buffer._needsOutputConversion = true; } bool String::substrMatch(const String& str, diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h index ddfa6f93870..c3355ee4f25 100644 --- a/src/mongo/db/fts/unicode/string.h +++ b/src/mongo/db/fts/unicode/string.h @@ -67,7 +67,7 @@ public: void resetData(const StringData utf8_src); /** - * Return a lowercased version of the String instance using the Unicode data in u_data.h. + * Return a lowercased version of the String instance. */ String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const; @@ -82,9 +82,30 @@ public: String substr(size_t begin, size_t end) const; /** - * Returns a UTF-8 encoded std::string version of the String instance. + * Copies the current String to another String. */ - std::string toString() const; + void copyToBuf(String& buffer) const; + + /** + * Takes a substring of the current String and puts it in another String. + */ + void substrToBuf(size_t pos, size_t len, String& buffer) const; + + /** + * Lowercases the current String and stores the result in another String. + */ + void toLowerToBuf(CaseFoldMode mode, String& buffer) const; + + /** + * Removes diacritics from the current String and stores the result in another String. + */ + void removeDiacriticsToBuf(String& buffer) const; + + /** + * Returns a UTF-8 encoded std::string version of the String instance. Uses the conversion + * stored in the output buffer when possible. + */ + std::string toString(); /** * Returns the number Unicode codepoints in the String. @@ -143,6 +164,17 @@ private: * The underlying UTF-32 data. */ std::u32string _data; + + /** + * A buffer for storing the result of the UTF-32 to UTF-8 conversion. + */ + std::string _outputBuf; + + /** + * A bool flag that is set to true when toString() will require that the UTF-32 to UTF-8 + * conversion be applied again. + */ + bool _needsOutputConversion; }; } // namespace unicode |