diff options
author | Mathias Stearn <mathias@10gen.com> | 2016-03-02 18:31:44 -0500 |
---|---|---|
committer | Mathias Stearn <mathias@10gen.com> | 2016-03-11 08:55:40 -0500 |
commit | 4b10e50494175df2b1ed8fc4f8e7f8c6ca6f06d5 (patch) | |
tree | 5c799a4d1b0b565854687ef4f25f8f3308afd462 /src/mongo/db/fts | |
parent | 72aab77138463d96494389bc538c13395c34a2d3 (diff) | |
download | mongo-4b10e50494175df2b1ed8fc4f8e7f8c6ca6f06d5.tar.gz |
SERVER-19936 Optimize UnicodeFTSTokenizer
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r-- | src/mongo/db/fts/fts_basic_tokenizer.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec_legacy.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_unicode_tokenizer.cpp | 43 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_unicode_tokenizer.h | 13 | ||||
-rw-r--r-- | src/mongo/db/fts/stemmer.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/fts/stemmer.h | 11 | ||||
-rw-r--r-- | src/mongo/db/fts/stop_words.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/fts/stop_words.h | 8 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/gen_delimiter_list.py | 33 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string.cpp | 143 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string.h | 92 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string_test.cpp | 58 |
12 files changed, 191 insertions, 229 deletions
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp index d8cb9874772..c43376b23ea 100644 --- a/src/mongo/db/fts/fts_basic_tokenizer.cpp +++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp @@ -68,9 +68,7 @@ bool BasicFTSTokenizer::moveNext() { continue; } - string word = token.data.toString(); - - word = tolowerString(token.data); + string word = tolowerString(token.data); // Stop words are case-sensitive so we need them to be lower cased to check // against the stop word list @@ -82,7 +80,7 @@ bool BasicFTSTokenizer::moveNext() { word = token.data.toString(); } - _stem = _stemmer.stem(word); + _stem = _stemmer.stem(word).toString(); return true; } } diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp index 8d36b51ca61..b7867dba520 100644 --- a/src/mongo/db/fts/fts_spec_legacy.cpp +++ b/src/mongo/db/fts/fts_spec_legacy.cpp @@ -81,7 +81,7 @@ void FTSSpec::_scoreStringV1(const Tools& tools, string term = tolowerString(t.data); if (tools.stopwords->isStopWord(term)) continue; - term = tools.stemmer->stem(term); + term = tools.stemmer->stem(term).toString(); ScoreHelperStruct& data = terms[term]; diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.cpp b/src/mongo/db/fts/fts_unicode_tokenizer.cpp index 036a7703fc3..8648523e4af 100644 --- a/src/mongo/db/fts/fts_unicode_tokenizer.cpp +++ b/src/mongo/db/fts/fts_unicode_tokenizer.cpp @@ -45,24 +45,19 @@ namespace fts { using std::string; UnicodeFTSTokenizer::UnicodeFTSTokenizer(const FTSLanguage* language) - : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) { - if (_language->str() == "english") { - _delimListLanguage = unicode::DelimiterListLanguage::kEnglish; - } else { - _delimListLanguage = unicode::DelimiterListLanguage::kNotEnglish; - } - - if (_language->str() == "turkish") { - _caseFoldMode = unicode::CaseFoldMode::kTurkish; - } else { - _caseFoldMode = unicode::CaseFoldMode::kNormal; - } -} + : _language(language), + _stemmer(language), + _stopWords(StopWords::getStopWords(language)), + _delimListLanguage(_language->str() == "english" + ? unicode::DelimiterListLanguage::kEnglish + : unicode::DelimiterListLanguage::kNotEnglish), + _caseFoldMode(_language->str() == "turkish" ? unicode::CaseFoldMode::kTurkish + : unicode::CaseFoldMode::kNormal) {} void UnicodeFTSTokenizer::reset(StringData document, Options options) { _options = options; _pos = 0; - _document.resetData(document); + _document.resetData(document); // Validates that document is valid UTF8. // Skip any leading delimiters (and handle the case where the document is entirely delimiters). _skipDelimiters(); @@ -71,7 +66,7 @@ void UnicodeFTSTokenizer::reset(StringData document, Options options) { bool UnicodeFTSTokenizer::moveNext() { while (true) { if (_pos >= _document.size()) { - _stem = ""; + _word = ""; return false; } @@ -81,30 +76,30 @@ bool UnicodeFTSTokenizer::moveNext() { (!unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage))) { ++_pos; } - _document.substrToBuf(start, _pos - start, _tokenBuf); + const size_t len = _pos - start; // Skip the delimiters before the next token. _skipDelimiters(); // Stop words are case-sensitive and diacritic sensitive, so we need them to be lower cased // but with diacritics not removed to check against the stop word list. - _tokenBuf.toLowerToBuf(_caseFoldMode, _wordBuf); + _word = _document.toLowerToBuf(&_wordBuf, _caseFoldMode, start, len); - if ((_options & kFilterStopWords) && _stopWords->isStopWord(_wordBuf.toString())) { + if ((_options & kFilterStopWords) && _stopWords->isStopWord(_word)) { continue; } if (_options & kGenerateCaseSensitiveTokens) { - _tokenBuf.copyToBuf(_wordBuf); + _word = _document.substrToBuf(&_wordBuf, start, len); } // The stemmer is diacritic sensitive, so stem the word before removing diacritics. - _stem = _stemmer.stem(_wordBuf.toString()); + _word = _stemmer.stem(_word); if (!(_options & kGenerateDiacriticSensitiveTokens)) { - _tokenBuf.resetData(_stem); - _tokenBuf.removeDiacriticsToBuf(_wordBuf); - _stem = _wordBuf.toString(); + // Can't use _wordbuf for output here because our input _word may point into it. + _word = unicode::String::caseFoldAndStripDiacritics( + &_finalBuf, _word, unicode::String::kCaseSensitive, _caseFoldMode); } return true; @@ -112,7 +107,7 @@ bool UnicodeFTSTokenizer::moveNext() { } StringData UnicodeFTSTokenizer::get() const { - return _stem; + return _word; } void UnicodeFTSTokenizer::_skipDelimiters() { diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.h b/src/mongo/db/fts/fts_unicode_tokenizer.h index c6feeca9725..2b8c54e3e88 100644 --- a/src/mongo/db/fts/fts_unicode_tokenizer.h +++ b/src/mongo/db/fts/fts_unicode_tokenizer.h @@ -73,22 +73,19 @@ private: */ void _skipDelimiters(); - unicode::DelimiterListLanguage _delimListLanguage; - unicode::CaseFoldMode _caseFoldMode; - const FTSLanguage* const _language; const Stemmer _stemmer; const StopWords* const _stopWords; + const unicode::DelimiterListLanguage _delimListLanguage; + const unicode::CaseFoldMode _caseFoldMode; unicode::String _document; size_t _pos; - - unicode::String _tokenBuf; - unicode::String _wordBuf; - + StringData _word; Options _options; - std::string _stem; + StackBufBuilder _wordBuf; + StackBufBuilder _finalBuf; }; } // namespace fts diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp index 07d17c050eb..19995baae4e 100644 --- a/src/mongo/db/fts/stemmer.cpp +++ b/src/mongo/db/fts/stemmer.cpp @@ -29,7 +29,6 @@ */ #include <cstdlib> -#include <string> #include "mongo/db/fts/stemmer.h" #include "mongo/util/mongoutils/str.h" @@ -38,8 +37,6 @@ namespace mongo { namespace fts { -using std::string; - Stemmer::Stemmer(const FTSLanguage* language) { _stemmer = NULL; if (language->str() != "none") @@ -53,9 +50,9 @@ Stemmer::~Stemmer() { } } -string Stemmer::stem(StringData word) const { +StringData Stemmer::stem(StringData word) const { if (!_stemmer) - return word.toString(); + return word; const sb_symbol* sb_sym = sb_stemmer_stem(_stemmer, (const sb_symbol*)word.rawData(), word.size()); @@ -65,7 +62,7 @@ string Stemmer::stem(StringData word) const { invariant(false); } - return string((const char*)(sb_sym), sb_stemmer_length(_stemmer)); + return StringData((const char*)(sb_sym), sb_stemmer_length(_stemmer)); } } } diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h index 59261bfb6a0..80bfdc2faf9 100644 --- a/src/mongo/db/fts/stemmer.h +++ b/src/mongo/db/fts/stemmer.h @@ -31,8 +31,6 @@ #pragma once -#include <string> - #include "mongo/base/string_data.h" #include "mongo/db/fts/fts_language.h" #include "third_party/libstemmer_c/include/libstemmer.h" @@ -53,7 +51,14 @@ public: Stemmer(const FTSLanguage* language); ~Stemmer(); - std::string stem(StringData word) const; + /** + * Stems an input word. + * + * The returned StringData is valid until the next call to any method on this object. Since the + * input may be returned unmodified, the output's lifetime may also expire when the input's + * does. + */ + StringData stem(StringData word) const; private: struct sb_stemmer* _stemmer; diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp index 0a44eaf25ff..6e108db70fc 100644 --- a/src/mongo/db/fts/stop_words.cpp +++ b/src/mongo/db/fts/stop_words.cpp @@ -52,7 +52,7 @@ StopWords::StopWords() {} StopWords::StopWords(const std::set<std::string>& words) { for (std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i) - _words.insert(*i); + _words[*i] = true; } const StopWords* StopWords::getStopWords(const FTSLanguage* language) { diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h index eebc11c012a..f80b638510d 100644 --- a/src/mongo/db/fts/stop_words.h +++ b/src/mongo/db/fts/stop_words.h @@ -35,7 +35,7 @@ #include <string> #include "mongo/db/fts/fts_language.h" -#include "mongo/platform/unordered_set.h" +#include "mongo/util/string_map.h" namespace mongo { @@ -48,8 +48,8 @@ public: StopWords(); StopWords(const std::set<std::string>& words); - bool isStopWord(const std::string& word) const { - return _words.count(word) > 0; + bool isStopWord(StringData word) const { + return _words.find(word) != _words.end(); } size_t numStopWords() const { @@ -59,7 +59,7 @@ public: static const StopWords* getStopWords(const FTSLanguage* language); private: - unordered_set<std::string> _words; + StringMap<bool> _words; // Used as a set. The values have no meaning. }; } } diff --git a/src/mongo/db/fts/unicode/gen_delimiter_list.py b/src/mongo/db/fts/unicode/gen_delimiter_list.py index d8a788c74ff..3c3131a8f6b 100644 --- a/src/mongo/db/fts/unicode/gen_delimiter_list.py +++ b/src/mongo/db/fts/unicode/gen_delimiter_list.py @@ -58,23 +58,32 @@ def generate(unicode_proplist_file, target): # As of Unicode 8.0.0, all of the delimiters we used for text index # version 2 are also in the list. - - out.write("""bool codepointIsDelimiter(char32_t codepoint, \ -DelimiterListLanguage lang) { - if (lang == DelimiterListLanguage::kEnglish && codepoint == '\\'') { - return false; - } - - // Most characters are latin letters, so filter those out first. - if (codepoint >= 'A' && codepoint <= 'Z') { - return false; - } else if (codepoint >= 'a' && codepoint <= 'z') { - return false; + out.write("static const bool englishAsciiDelimiters[128] = {\n") + for cp in range(0x80): + if cp == ord("'"): + out.write(" 0, // ' special case\n") + else: + out.write(" %d, // 0x%x\n" % (cp in delim_codepoints, cp)) + out.write("};\n") + + out.write("static const bool nonEnglishAsciiDelimiters[128] = {\n") + for cp in range(0x80): + out.write(" %d, // 0x%x\n" % (cp in delim_codepoints, cp)) + out.write("};\n") + + out.write("""bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang) { + if (codepoint <= 0x7f) { + if (lang == DelimiterListLanguage::kEnglish) { + return englishAsciiDelimiters[codepoint]; + } + return nonEnglishAsciiDelimiters[codepoint]; } switch (codepoint) {\n""") for delim in sorted(delim_codepoints): + if delim <= 0x7f: # ascii codepoints handled in lists above. + continue out.write("\ case " + str(hex(delim)) + ": return true;\n") diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp index 3218f04dbf7..10737acc3ed 100644 --- a/src/mongo/db/fts/unicode/string.cpp +++ b/src/mongo/db/fts/unicode/string.cpp @@ -39,6 +39,28 @@ namespace mongo { namespace unicode { +namespace { +template <typename OutputIterator> +inline void appendUtf8Codepoint(char32_t codepoint, OutputIterator* outputIt) { + if (codepoint <= 0x7f /* max 1-byte codepoint */) { + *(*outputIt)++ = (codepoint); + } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) { + *(*outputIt)++ = ((codepoint >> (6 * 1)) | 0xc0); // 2 leading 1s. + *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80); + } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) { + *(*outputIt)++ = ((codepoint >> (6 * 2)) | 0xe0); // 3 leading 1s. + *(*outputIt)++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80); + *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80); + } else { + uassert(ErrorCodes::BadValue, "text contains invalid UTF-8", codepoint <= 0x10FFFF); + *(*outputIt)++ = ((codepoint >> (6 * 3)) | 0xf0); // 4 leading 1s. + *(*outputIt)++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80); + *(*outputIt)++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80); + *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80); + } +} +} + using linenoise_utf8::copyString32to8; using linenoise_utf8::copyString8to32; @@ -93,74 +115,45 @@ std::string String::toString() { return _outputBuf; } -String String::substr(size_t pos, size_t len) const { - unicode::String buf; - substrToBuf(pos, len, buf); - return buf; -} - -String String::toLower(CaseFoldMode mode) const { - unicode::String buf; - toLowerToBuf(mode, buf); - return buf; -} - -String String::removeDiacritics() const { - unicode::String buf; - removeDiacriticsToBuf(buf); - return buf; -} - -void String::copyToBuf(String& buffer) const { - buffer._data = _data; - buffer._data.resize(_data.size()); - auto index = 0; - for (auto codepoint : _data) { - buffer._data[index++] = codepoint; +template <typename Func> +StringData String::substrToBufWithTransform(StackBufBuilder* buffer, + size_t pos, + size_t len, + Func func) const { + pos = std::min(pos, _data.size()); + len = std::min(len, _data.size() - pos); + + buffer->reset(); + auto outputIt = buffer->skip(len * 4); // Reserve room for worst-case expansion. + auto inputIt = _data.begin() + pos; + for (size_t i = 0; i < len; i++) { + appendUtf8Codepoint(func(*inputIt++), &outputIt); } - buffer._needsOutputConversion = true; + buffer->setlen(outputIt - buffer->buf()); + return {buffer->buf(), size_t(buffer->len())}; } -void String::substrToBuf(size_t pos, size_t len, String& buffer) const { - buffer._data.resize(len + 1); - for (size_t index = 0, src_pos = pos; index < len;) { - buffer._data[index++] = _data[src_pos++]; - } - buffer._data[len] = '\0'; - buffer._needsOutputConversion = true; +StringData String::substrToBuf(StackBufBuilder* buffer, size_t pos, size_t len) const { + const auto identityFunc = [](char32_t ch) { return ch; }; + return substrToBufWithTransform(buffer, pos, len, identityFunc); } -void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const { - buffer._data.resize(_data.size()); - auto outIt = buffer._data.begin(); - for (auto codepoint : _data) { - *outIt++ = codepointToLower(codepoint, mode); - } - buffer._needsOutputConversion = true; +StringData String::toLowerToBuf(StackBufBuilder* buffer, + CaseFoldMode mode, + size_t pos, + size_t len) const { + const auto toLower = [mode](char32_t ch) { return codepointToLower(ch, mode); }; + return substrToBufWithTransform(buffer, pos, len, toLower); } -void String::removeDiacriticsToBuf(String& buffer) const { - buffer._data.resize(_data.size()); - auto outIt = buffer._data.begin(); - for (auto codepoint : _data) { - if (codepoint <= 0x7f) { - // ASCII only has two diacritics so they are hard-coded here. - if (codepoint != '^' && codepoint != '`') { - *outIt++ = codepoint; - } - } else if (auto clean = codepointRemoveDiacritics(codepoint)) { - *outIt++ = clean; - } else { - // codepoint was a pure diacritic mark, so skip it. - } - } - buffer._data.resize(outIt - buffer._data.begin()); - buffer._needsOutputConversion = true; -} -String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8, - SubstrMatchOptions options, - CaseFoldMode mode) { +StringData String::caseFoldAndStripDiacritics(StackBufBuilder* buffer, + StringData utf8, + SubstrMatchOptions options, + CaseFoldMode mode) { + // This fires if your input buffer the same as your output buffer. + invariant(buffer->buf() != utf8.rawData()); + if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) { // No transformation needed. Just return the input data unmodified. return utf8; @@ -170,8 +163,8 @@ String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8, // and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2 // bytes. The biggest codepoint is 4 bytes which is also 2x 2 bytes. This holds as long as we // don't map a single code point to more than one. - std::unique_ptr<char[]> buffer(new char[utf8.size() * 2]); - auto outputIt = buffer.get(); + buffer->reset(); + auto outputIt = buffer->skip(utf8.size() * 2); for (auto inputIt = utf8.begin(), endIt = utf8.end(); inputIt != endIt;) { #ifdef MONGO_HAVE_FAST_BYTE_VECTOR @@ -258,25 +251,11 @@ String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8, } } - // Back to utf-8. - if (codepoint <= 0x7f /* max 1-byte codepoint */) { - *outputIt++ = (codepoint); - } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) { - *outputIt++ = ((codepoint >> (6 * 1)) | 0xc0); // 2 leading 1s. - *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80); - } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) { - *outputIt++ = ((codepoint >> (6 * 2)) | 0xe0); // 3 leading 1s. - *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80); - *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80); - } else { - *outputIt++ = ((codepoint >> (6 * 3)) | 0xf0); // 4 leading 1s. - *outputIt++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80); - *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80); - *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80); - } + appendUtf8Codepoint(codepoint, &outputIt); } - return {std::move(buffer), outputIt}; + buffer->setlen(outputIt - buffer->buf()); + return {buffer->buf(), size_t(buffer->len())}; } bool String::substrMatch(const std::string& str, @@ -288,8 +267,10 @@ bool String::substrMatch(const std::string& str, options &= ~kCaseSensitive; } - auto haystack = caseFoldAndStripDiacritics(str, options, cfMode); - auto needle = caseFoldAndStripDiacritics(find, options, cfMode); + StackBufBuilder haystackBuf; + StackBufBuilder needleBuf; + auto haystack = caseFoldAndStripDiacritics(&haystackBuf, str, options, cfMode); + auto needle = caseFoldAndStripDiacritics(&needleBuf, find, options, cfMode); // Case sensitive and diacritic sensitive. return boost::algorithm::boyer_moore_search( diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h index 64a0d89918b..dac83ed3b24 100644 --- a/src/mongo/db/fts/unicode/string.h +++ b/src/mongo/db/fts/unicode/string.h @@ -33,6 +33,7 @@ #include <string> #include "mongo/base/string_data.h" +#include "mongo/bson/util/builder.h" #include "mongo/db/fts/unicode/codepoints.h" namespace mongo { @@ -45,34 +46,6 @@ namespace unicode { */ class String { public: - /** - * A StringData that may own its own buffer. - */ - class MaybeOwnedStringData : public StringData { - public: - /** - * Makes an empty, unowned string. - */ - MaybeOwnedStringData() = default; - - /** - * Makes an owned string. - */ - MaybeOwnedStringData(std::unique_ptr<char[]>&& buffer, const char* endIt) - : StringData(buffer.get(), endIt - buffer.get()), _buffer(std::move(buffer)) {} - - /** - * Makes an unowned string. - */ - /*implicit*/ MaybeOwnedStringData(StringData str) : StringData(str) {} - MaybeOwnedStringData& operator=(StringData str) { - return (*this = MaybeOwnedStringData(str)); - } - - private: - std::unique_ptr<char[]> _buffer; - }; - String() = default; #if defined(_MSC_VER) && _MSC_VER < 1900 @@ -96,39 +69,19 @@ public: void resetData(const StringData utf8_src); /** - * Return a lowercased version of the String instance. - */ - String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const; - - /** - * Returns a version of the String instance with diacritics and combining marks removed. - */ - String removeDiacritics() const; - - /** - * Returns a substring of the String instance, using the same semantics as std::string::substr. - */ - String substr(size_t begin, size_t end) const; - - /** - * Copies the current String to another String. - */ - void copyToBuf(String& buffer) const; - - /** * Takes a substring of the current String and puts it in another String. + * Overwrites buffer's previous contents rather than appending. */ - void substrToBuf(size_t pos, size_t len, String& buffer) const; - - /** - * Lowercases the current String and stores the result in another String. - */ - void toLowerToBuf(CaseFoldMode mode, String& buffer) const; + StringData substrToBuf(StackBufBuilder* buffer, size_t pos, size_t len) const; /** - * Removes diacritics from the current String and stores the result in another String. + * Lowercases a substring of the current String and stores the UTF8 result in buffer. + * Overwrites buffer's previous contents rather than appending. */ - void removeDiacriticsToBuf(String& buffer) const; + StringData toLowerToBuf(StackBufBuilder* buffer, + CaseFoldMode mode, + size_t offset = 0, + size_t len = std::string::npos) const; /** * Returns a UTF-8 encoded std::string version of the String instance. Uses the conversion @@ -183,10 +136,20 @@ public: /** * Strips diacritics and case-folds the utf8 input string, as needed to support options. - */ - static MaybeOwnedStringData caseFoldAndStripDiacritics(StringData utf8, - SubstrMatchOptions options, - CaseFoldMode mode); + * + * The options field specifies what operations to *skip*, so kCaseSensitive means to skip case + * folding and kDiacriticSensitive means to skip diacritic striping. If both flags are + * specified, the input utf8 StringData is returned directly without any processing or copying. + * + * If processing is performed, the returned StringData will be placed in buffer. buffer's + * contents (if any) will be replaced. Since we may return the input unmodified the returned + * StringData's lifetime is the shorter of the input utf8 and the next modification to buffer. + * The input utf8 must not point into buffer. + */ + static StringData caseFoldAndStripDiacritics(StackBufBuilder* buffer, + StringData utf8, + SubstrMatchOptions options, + CaseFoldMode mode); private: /** @@ -195,6 +158,15 @@ private: void setData(const StringData utf8_src); /** + * Unified implementation of substrToBuf and toLowerToBuf. + */ + template <typename Func> + StringData substrToBufWithTransform(StackBufBuilder* buffer, + size_t pos, + size_t len, + Func transform) const; + + /** * The underlying UTF-32 data. */ std::u32string _data; diff --git a/src/mongo/db/fts/unicode/string_test.cpp b/src/mongo/db/fts/unicode/string_test.cpp index d627120e9e7..e67228f76de 100644 --- a/src/mongo/db/fts/unicode/string_test.cpp +++ b/src/mongo/db/fts/unicode/string_test.cpp @@ -61,19 +61,30 @@ auto kNormal = CaseFoldMode::kNormal; // Macro to preserve line numbers and arguments in error messages. -#define TEST_CASE_FOLD_AND_STRIP_DIACRITICS(expected, input, options, caseFoldMode) \ - ASSERT_EQ(expected, String::caseFoldAndStripDiacritics(input, options, caseFoldMode)); \ - ASSERT_EQ(expected + filler, \ - String::caseFoldAndStripDiacritics(input + filler, options, caseFoldMode)) +#define TEST_CASE_FOLD_AND_STRIP_DIACRITICS(expected, input, options, caseFoldMode) \ + do { \ + StackBufBuilder buf; \ + ASSERT_EQ(expected, \ + String::caseFoldAndStripDiacritics(&buf, input, options, caseFoldMode)); \ + ASSERT_EQ( \ + expected + filler, \ + String::caseFoldAndStripDiacritics(&buf, input + filler, options, caseFoldMode)); \ + } while (0) + +TEST(UnicodeString, SubstrTest) { + StackBufBuilder buf; + String indexes("01234"); + ASSERT_EQ("123", indexes.substrToBuf(&buf, 1, 3)); + ASSERT_EQ("4", indexes.substrToBuf(&buf, 4, 3)); // len too long. + ASSERT_EQ("", indexes.substrToBuf(&buf, 6, 3)); // pos past end. + ASSERT_EQ("", indexes.substrToBuf(&buf, 1, 0)); // len == 0. +} TEST(UnicodeString, RemoveDiacritics) { // Test all ascii chars. for (unsigned char ch = 0; ch <= 0x7F; ch++) { const auto input = std::string(1, ch); const auto output = codepointIsDiacritic(ch) ? std::string() : std::string(1, ch); - if (ch) { // String's constructor doesn't handle embedded NUL bytes. - ASSERT_EQUALS(output, String(input).removeDiacritics().toString()); - } TEST_CASE_FOLD_AND_STRIP_DIACRITICS(output, input, kCaseSensitive, kNormal); } @@ -83,21 +94,20 @@ TEST(UnicodeString, RemoveDiacritics) { // NFD Normalized Text ("Café"). const char test2[] = {'C', 'a', 'f', 'e', static_cast<char>(0xcc), static_cast<char>(0x81), 0}; - ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), String(test1).removeDiacritics().toString()); - ASSERT_EQUALS(UTF8("Cafe"), String(test2).removeDiacritics().toString()); - TEST_CASE_FOLD_AND_STRIP_DIACRITICS( UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal); TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("Cafe"), test2, kCaseSensitive, kNormal); } TEST(UnicodeString, CaseFolding) { + StackBufBuilder buf; + // Test all ascii chars. for (unsigned char ch = 0; ch <= 0x7F; ch++) { const auto upper = std::string(1, ch); const auto lower = std::string(1, std::tolower(ch)); if (ch) { // String's constructor doesn't handle embedded NUL bytes. - ASSERT_EQUALS(lower, String(upper).toLower().toString()); + ASSERT_EQUALS(lower, String(upper).toLowerToBuf(&buf, kNormal)); } TEST_CASE_FOLD_AND_STRIP_DIACRITICS(lower, upper, kDiacriticSensitive, kNormal); } @@ -105,8 +115,8 @@ TEST(UnicodeString, CaseFolding) { const char test1[] = UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?"); const char test2[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?"); - ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLower().toString()); - ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLower().toString()); + ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLowerToBuf(&buf, kNormal)); + ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLowerToBuf(&buf, kNormal)); TEST_CASE_FOLD_AND_STRIP_DIACRITICS( UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal); @@ -115,13 +125,14 @@ TEST(UnicodeString, CaseFolding) { } TEST(UnicodeString, CaseFoldingTurkish) { + StackBufBuilder buf; const char test1[] = UTF8("KAC YASINDASINIZ"); const char test2[] = UTF8("KAC YASİNDASİNİZ"); ASSERT_EQUALS(UTF8("kac yasındasınız"), - String(test1).toLower(CaseFoldMode::kTurkish).toString()); + String(test1).toLowerToBuf(&buf, CaseFoldMode::kTurkish)); ASSERT_EQUALS(UTF8("kac yasindasiniz"), - String(test2).toLower(CaseFoldMode::kTurkish).toString()); + String(test2).toLowerToBuf(&buf, CaseFoldMode::kTurkish)); TEST_CASE_FOLD_AND_STRIP_DIACRITICS( UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish); @@ -137,12 +148,6 @@ TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) { // NFD Normalized Text ("CAFÉ"). const char test3[] = {'C', 'A', 'F', 'E', static_cast<char>(0xcc), static_cast<char>(0x81), 0}; - ASSERT_EQUALS(UTF8("ποσο χρονων εισαι?"), - String(test1).toLower().removeDiacritics().toString()); - ASSERT_EQUALS(UTF8("¿cuantos anos tienes tu?"), - String(test2).toLower().removeDiacritics().toString()); - ASSERT_EQUALS(UTF8("cafe"), String(test3).toLower().removeDiacritics().toString()); - TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal); TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal); TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("cafe"), test3, 0, kNormal); @@ -214,13 +219,16 @@ TEST(UnicodeString, BadUTF8) { ASSERT_THROWS(String test3(invalid3), AssertionException); ASSERT_THROWS(String test4(invalid4), AssertionException); + StackBufBuilder buf; + // caseFoldAndStripDiacritics doesn't make any guarantees about behavior when fed invalid utf8. // These calls are to ensure that they don't trigger any faults in sanitizing builds. - String::caseFoldAndStripDiacritics(invalid1, 0, kNormal); - String::caseFoldAndStripDiacritics(invalid2, 0, kNormal); - String::caseFoldAndStripDiacritics(invalid3, 0, kNormal); + String::caseFoldAndStripDiacritics(&buf, invalid1, 0, kNormal); + String::caseFoldAndStripDiacritics(&buf, invalid2, 0, kNormal); + String::caseFoldAndStripDiacritics(&buf, invalid3, 0, kNormal); - ASSERT_THROWS(String::caseFoldAndStripDiacritics(invalid4, 0, kNormal), AssertionException); + ASSERT_THROWS(String::caseFoldAndStripDiacritics(&buf, invalid4, 0, kNormal), + AssertionException); } TEST(UnicodeString, UTF32ToUTF8) { |