SERVER-19936 Optimize UnicodeFTSTokenizer

author: Mathias Stearn <mathias@10gen.com> 2016-03-02 18:31:44 -0500
committer: Mathias Stearn <mathias@10gen.com> 2016-03-11 08:55:40 -0500
commit: 4b10e50494175df2b1ed8fc4f8e7f8c6ca6f06d5 (patch)
tree: 5c799a4d1b0b565854687ef4f25f8f3308afd462 /src/mongo/db/fts
parent: 72aab77138463d96494389bc538c13395c34a2d3 (diff)
download: mongo-4b10e50494175df2b1ed8fc4f8e7f8c6ca6f06d5.tar.gz
12 files changed, 191 insertions, 229 deletions
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp
index d8cb9874772..c43376b23ea 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer.cpp
+++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp
@@ -68,9 +68,7 @@ bool BasicFTSTokenizer::moveNext() {
             continue;
         }
 
-        string word = token.data.toString();
-
-        word = tolowerString(token.data);
+        string word = tolowerString(token.data);
 
         // Stop words are case-sensitive so we need them to be lower cased to check
         // against the stop word list
@@ -82,7 +80,7 @@ bool BasicFTSTokenizer::moveNext() {
             word = token.data.toString();
         }
 
-        _stem = _stemmer.stem(word);
+        _stem = _stemmer.stem(word).toString();
         return true;
     }
 }
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp
index 8d36b51ca61..b7867dba520 100644
--- a/src/mongo/db/fts/fts_spec_legacy.cpp
+++ b/src/mongo/db/fts/fts_spec_legacy.cpp
@@ -81,7 +81,7 @@ void FTSSpec::_scoreStringV1(const Tools& tools,
         string term = tolowerString(t.data);
         if (tools.stopwords->isStopWord(term))
             continue;
-        term = tools.stemmer->stem(term);
+        term = tools.stemmer->stem(term).toString();
 
         ScoreHelperStruct& data = terms[term];
 
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.cpp b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
index 036a7703fc3..8648523e4af 100644
--- a/src/mongo/db/fts/fts_unicode_tokenizer.cpp
+++ b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
@@ -45,24 +45,19 @@ namespace fts {
 using std::string;
 
 UnicodeFTSTokenizer::UnicodeFTSTokenizer(const FTSLanguage* language)
-    : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {
-    if (_language->str() == "english") {
-        _delimListLanguage = unicode::DelimiterListLanguage::kEnglish;
-    } else {
-        _delimListLanguage = unicode::DelimiterListLanguage::kNotEnglish;
-    }
-
-    if (_language->str() == "turkish") {
-        _caseFoldMode = unicode::CaseFoldMode::kTurkish;
-    } else {
-        _caseFoldMode = unicode::CaseFoldMode::kNormal;
-    }
-}
+    : _language(language),
+      _stemmer(language),
+      _stopWords(StopWords::getStopWords(language)),
+      _delimListLanguage(_language->str() == "english"
+                             ? unicode::DelimiterListLanguage::kEnglish
+                             : unicode::DelimiterListLanguage::kNotEnglish),
+      _caseFoldMode(_language->str() == "turkish" ? unicode::CaseFoldMode::kTurkish
+                                                  : unicode::CaseFoldMode::kNormal) {}
 
 void UnicodeFTSTokenizer::reset(StringData document, Options options) {
     _options = options;
     _pos = 0;
-    _document.resetData(document);
+    _document.resetData(document);  // Validates that document is valid UTF8.
 
     // Skip any leading delimiters (and handle the case where the document is entirely delimiters).
     _skipDelimiters();
@@ -71,7 +66,7 @@ void UnicodeFTSTokenizer::reset(StringData document, Options options) {
 bool UnicodeFTSTokenizer::moveNext() {
     while (true) {
         if (_pos >= _document.size()) {
-            _stem = "";
+            _word = "";
             return false;
         }
 
@@ -81,30 +76,30 @@ bool UnicodeFTSTokenizer::moveNext() {
                (!unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage))) {
             ++_pos;
         }
-        _document.substrToBuf(start, _pos - start, _tokenBuf);
+        const size_t len = _pos - start;
 
         // Skip the delimiters before the next token.
         _skipDelimiters();
 
         // Stop words are case-sensitive and diacritic sensitive, so we need them to be lower cased
         // but with diacritics not removed to check against the stop word list.
-        _tokenBuf.toLowerToBuf(_caseFoldMode, _wordBuf);
+        _word = _document.toLowerToBuf(&_wordBuf, _caseFoldMode, start, len);
 
-        if ((_options & kFilterStopWords) && _stopWords->isStopWord(_wordBuf.toString())) {
+        if ((_options & kFilterStopWords) && _stopWords->isStopWord(_word)) {
             continue;
         }
 
         if (_options & kGenerateCaseSensitiveTokens) {
-            _tokenBuf.copyToBuf(_wordBuf);
+            _word = _document.substrToBuf(&_wordBuf, start, len);
         }
 
         // The stemmer is diacritic sensitive, so stem the word before removing diacritics.
-        _stem = _stemmer.stem(_wordBuf.toString());
+        _word = _stemmer.stem(_word);
 
         if (!(_options & kGenerateDiacriticSensitiveTokens)) {
-            _tokenBuf.resetData(_stem);
-            _tokenBuf.removeDiacriticsToBuf(_wordBuf);
-            _stem = _wordBuf.toString();
+            // Can't use _wordBuf for output here because our input _word may point into it.
+            _word = unicode::String::caseFoldAndStripDiacritics(
+                &_finalBuf, _word, unicode::String::kCaseSensitive, _caseFoldMode);
         }
 
         return true;
@@ -112,7 +107,7 @@ bool UnicodeFTSTokenizer::moveNext() {
 }
 
 StringData UnicodeFTSTokenizer::get() const {
-    return _stem;
+    return _word;
 }
 
 void UnicodeFTSTokenizer::_skipDelimiters() {
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.h b/src/mongo/db/fts/fts_unicode_tokenizer.h
index c6feeca9725..2b8c54e3e88 100644
--- a/src/mongo/db/fts/fts_unicode_tokenizer.h
+++ b/src/mongo/db/fts/fts_unicode_tokenizer.h
@@ -73,22 +73,19 @@ private:
      */
     void _skipDelimiters();
 
-    unicode::DelimiterListLanguage _delimListLanguage;
-    unicode::CaseFoldMode _caseFoldMode;
-
     const FTSLanguage* const _language;
     const Stemmer _stemmer;
     const StopWords* const _stopWords;
+    const unicode::DelimiterListLanguage _delimListLanguage;
+    const unicode::CaseFoldMode _caseFoldMode;
 
     unicode::String _document;
     size_t _pos;
-
-    unicode::String _tokenBuf;
-    unicode::String _wordBuf;
-
+    StringData _word;
     Options _options;
 
-    std::string _stem;
+    StackBufBuilder _wordBuf;
+    StackBufBuilder _finalBuf;
 };
 
 }  // namespace fts
diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp
index 07d17c050eb..19995baae4e 100644
--- a/src/mongo/db/fts/stemmer.cpp
+++ b/src/mongo/db/fts/stemmer.cpp
@@ -29,7 +29,6 @@
 */
 
 #include <cstdlib>
-#include <string>
 
 #include "mongo/db/fts/stemmer.h"
 #include "mongo/util/mongoutils/str.h"
@@ -38,8 +37,6 @@ namespace mongo {
 
 namespace fts {
 
-using std::string;
-
 Stemmer::Stemmer(const FTSLanguage* language) {
     _stemmer = NULL;
     if (language->str() != "none")
@@ -53,9 +50,9 @@ Stemmer::~Stemmer() {
     }
 }
 
-string Stemmer::stem(StringData word) const {
+StringData Stemmer::stem(StringData word) const {
     if (!_stemmer)
-        return word.toString();
+        return word;
 
     const sb_symbol* sb_sym =
         sb_stemmer_stem(_stemmer, (const sb_symbol*)word.rawData(), word.size());
@@ -65,7 +62,7 @@ string Stemmer::stem(StringData word) const {
         invariant(false);
     }
 
-    return string((const char*)(sb_sym), sb_stemmer_length(_stemmer));
+    return StringData((const char*)(sb_sym), sb_stemmer_length(_stemmer));
 }
 }
 }
diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h
index 59261bfb6a0..80bfdc2faf9 100644
--- a/src/mongo/db/fts/stemmer.h
+++ b/src/mongo/db/fts/stemmer.h
@@ -31,8 +31,6 @@
 
 #pragma once
 
-#include <string>
-
 #include "mongo/base/string_data.h"
 #include "mongo/db/fts/fts_language.h"
 #include "third_party/libstemmer_c/include/libstemmer.h"
@@ -53,7 +51,14 @@ public:
     Stemmer(const FTSLanguage* language);
     ~Stemmer();
 
-    std::string stem(StringData word) const;
+    /**
+     * Stems an input word.
+     *
+     * The returned StringData is valid until the next call to any method on this object. Since the
+     * input may be returned unmodified, the output's lifetime may also expire when the input's
+     * does.
+     */
+    StringData stem(StringData word) const;
 
 private:
     struct sb_stemmer* _stemmer;
diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp
index 0a44eaf25ff..6e108db70fc 100644
--- a/src/mongo/db/fts/stop_words.cpp
+++ b/src/mongo/db/fts/stop_words.cpp
@@ -52,7 +52,7 @@ StopWords::StopWords() {}
 
 StopWords::StopWords(const std::set<std::string>& words) {
     for (std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i)
-        _words.insert(*i);
+        _words[*i] = true;
 }
 
 const StopWords* StopWords::getStopWords(const FTSLanguage* language) {
diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h
index eebc11c012a..f80b638510d 100644
--- a/src/mongo/db/fts/stop_words.h
+++ b/src/mongo/db/fts/stop_words.h
@@ -35,7 +35,7 @@
 #include <string>
 
 #include "mongo/db/fts/fts_language.h"
-#include "mongo/platform/unordered_set.h"
+#include "mongo/util/string_map.h"
 
 namespace mongo {
 
@@ -48,8 +48,8 @@ public:
     StopWords();
     StopWords(const std::set<std::string>& words);
 
-    bool isStopWord(const std::string& word) const {
-        return _words.count(word) > 0;
+    bool isStopWord(StringData word) const {
+        return _words.find(word) != _words.end();
     }
 
     size_t numStopWords() const {
@@ -59,7 +59,7 @@ public:
     static const StopWords* getStopWords(const FTSLanguage* language);
 
 private:
-    unordered_set<std::string> _words;
+    StringMap<bool> _words;  // Used as a set. The values have no meaning.
 };
 }
 }
diff --git a/src/mongo/db/fts/unicode/gen_delimiter_list.py b/src/mongo/db/fts/unicode/gen_delimiter_list.py
index d8a788c74ff..3c3131a8f6b 100644
--- a/src/mongo/db/fts/unicode/gen_delimiter_list.py
+++ b/src/mongo/db/fts/unicode/gen_delimiter_list.py
@@ -58,23 +58,32 @@ def generate(unicode_proplist_file, target):
 
     # As of Unicode 8.0.0, all of the delimiters we used for text index 
     # version 2 are also in the list.
-
-    out.write("""bool codepointIsDelimiter(char32_t codepoint, \
-DelimiterListLanguage lang) {
-    if (lang == DelimiterListLanguage::kEnglish && codepoint == '\\'') {
-        return false;
-    }
-
-    // Most characters are latin letters, so filter those out first.
-    if (codepoint >= 'A' && codepoint <= 'Z') {
-        return false;
-    } else if (codepoint >= 'a' && codepoint <= 'z') {
-        return false;
+    out.write("static const bool englishAsciiDelimiters[128] = {\n")
+    for cp in range(0x80):
+        if cp == ord("'"):
+            out.write("    0, // ' special case\n")
+        else:
+            out.write("    %d, // 0x%x\n" % (cp in delim_codepoints, cp))
+    out.write("};\n")
+
+    out.write("static const bool nonEnglishAsciiDelimiters[128] = {\n")
+    for cp in range(0x80):
+        out.write("    %d, // 0x%x\n" % (cp in delim_codepoints, cp))
+    out.write("};\n")
+
+    out.write("""bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang) {
+    if (codepoint <= 0x7f) {
+        if (lang == DelimiterListLanguage::kEnglish) {
+            return englishAsciiDelimiters[codepoint];
+        }
+        return nonEnglishAsciiDelimiters[codepoint];
     }
 
     switch (codepoint) {\n""")
 
     for delim in sorted(delim_codepoints):
+        if delim <= 0x7f: # ascii codepoints handled in lists above.
+            continue
         out.write("\
     case " + str(hex(delim)) + ": return true;\n")
 
diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp
index 3218f04dbf7..10737acc3ed 100644
--- a/src/mongo/db/fts/unicode/string.cpp
+++ b/src/mongo/db/fts/unicode/string.cpp
@@ -39,6 +39,28 @@
 namespace mongo {
 namespace unicode {
 
+namespace {
+template <typename OutputIterator>
+inline void appendUtf8Codepoint(char32_t codepoint, OutputIterator* outputIt) {
+    if (codepoint <= 0x7f /* max 1-byte codepoint */) {
+        *(*outputIt)++ = (codepoint);
+    } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) {
+        *(*outputIt)++ = ((codepoint >> (6 * 1)) | 0xc0);  // 2 leading 1s.
+        *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+    } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) {
+        *(*outputIt)++ = ((codepoint >> (6 * 2)) | 0xe0);  // 3 leading 1s.
+        *(*outputIt)++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
+        *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+    } else {
+        uassert(ErrorCodes::BadValue, "text contains invalid UTF-8", codepoint <= 0x10FFFF);
+        *(*outputIt)++ = ((codepoint >> (6 * 3)) | 0xf0);  // 4 leading 1s.
+        *(*outputIt)++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80);
+        *(*outputIt)++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
+        *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+    }
+}
+}
+
 using linenoise_utf8::copyString32to8;
 using linenoise_utf8::copyString8to32;
 
@@ -93,74 +115,45 @@ std::string String::toString() {
     return _outputBuf;
 }
 
-String String::substr(size_t pos, size_t len) const {
-    unicode::String buf;
-    substrToBuf(pos, len, buf);
-    return buf;
-}
-
-String String::toLower(CaseFoldMode mode) const {
-    unicode::String buf;
-    toLowerToBuf(mode, buf);
-    return buf;
-}
-
-String String::removeDiacritics() const {
-    unicode::String buf;
-    removeDiacriticsToBuf(buf);
-    return buf;
-}
-
-void String::copyToBuf(String& buffer) const {
-    buffer._data = _data;
-    buffer._data.resize(_data.size());
-    auto index = 0;
-    for (auto codepoint : _data) {
-        buffer._data[index++] = codepoint;
+template <typename Func>
+StringData String::substrToBufWithTransform(StackBufBuilder* buffer,
+                                            size_t pos,
+                                            size_t len,
+                                            Func func) const {
+    pos = std::min(pos, _data.size());
+    len = std::min(len, _data.size() - pos);
+
+    buffer->reset();
+    auto outputIt = buffer->skip(len * 4);  // Reserve room for worst-case expansion.
+    auto inputIt = _data.begin() + pos;
+    for (size_t i = 0; i < len; i++) {
+        appendUtf8Codepoint(func(*inputIt++), &outputIt);
     }
-    buffer._needsOutputConversion = true;
+    buffer->setlen(outputIt - buffer->buf());
+    return {buffer->buf(), size_t(buffer->len())};
 }
 
-void String::substrToBuf(size_t pos, size_t len, String& buffer) const {
-    buffer._data.resize(len + 1);
-    for (size_t index = 0, src_pos = pos; index < len;) {
-        buffer._data[index++] = _data[src_pos++];
-    }
-    buffer._data[len] = '\0';
-    buffer._needsOutputConversion = true;
+StringData String::substrToBuf(StackBufBuilder* buffer, size_t pos, size_t len) const {
+    const auto identityFunc = [](char32_t ch) { return ch; };
+    return substrToBufWithTransform(buffer, pos, len, identityFunc);
 }
 
-void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const {
-    buffer._data.resize(_data.size());
-    auto outIt = buffer._data.begin();
-    for (auto codepoint : _data) {
-        *outIt++ = codepointToLower(codepoint, mode);
-    }
-    buffer._needsOutputConversion = true;
+StringData String::toLowerToBuf(StackBufBuilder* buffer,
+                                CaseFoldMode mode,
+                                size_t pos,
+                                size_t len) const {
+    const auto toLower = [mode](char32_t ch) { return codepointToLower(ch, mode); };
+    return substrToBufWithTransform(buffer, pos, len, toLower);
 }
 
-void String::removeDiacriticsToBuf(String& buffer) const {
-    buffer._data.resize(_data.size());
-    auto outIt = buffer._data.begin();
-    for (auto codepoint : _data) {
-        if (codepoint <= 0x7f) {
-            // ASCII only has two diacritics so they are hard-coded here.
-            if (codepoint != '^' && codepoint != '`') {
-                *outIt++ = codepoint;
-            }
-        } else if (auto clean = codepointRemoveDiacritics(codepoint)) {
-            *outIt++ = clean;
-        } else {
-            // codepoint was a pure diacritic mark, so skip it.
-        }
-    }
-    buffer._data.resize(outIt - buffer._data.begin());
-    buffer._needsOutputConversion = true;
-}
 
-String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8,
-                                                                SubstrMatchOptions options,
-                                                                CaseFoldMode mode) {
+StringData String::caseFoldAndStripDiacritics(StackBufBuilder* buffer,
+                                              StringData utf8,
+                                              SubstrMatchOptions options,
+                                              CaseFoldMode mode) {
+    // This fires if your input buffer is the same as your output buffer.
+    invariant(buffer->buf() != utf8.rawData());
+
     if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) {
         // No transformation needed. Just return the input data unmodified.
         return utf8;
@@ -170,8 +163,8 @@ String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8,
     // and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2
     // bytes. The biggest codepoint is 4 bytes which is also 2x 2 bytes. This holds as long as we
     // don't map a single code point to more than one.
-    std::unique_ptr<char[]> buffer(new char[utf8.size() * 2]);
-    auto outputIt = buffer.get();
+    buffer->reset();
+    auto outputIt = buffer->skip(utf8.size() * 2);
 
     for (auto inputIt = utf8.begin(), endIt = utf8.end(); inputIt != endIt;) {
 #ifdef MONGO_HAVE_FAST_BYTE_VECTOR
@@ -258,25 +251,11 @@ String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8,
             }
         }
 
-        // Back to utf-8.
-        if (codepoint <= 0x7f /* max 1-byte codepoint */) {
-            *outputIt++ = (codepoint);
-        } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) {
-            *outputIt++ = ((codepoint >> (6 * 1)) | 0xc0);  // 2 leading 1s.
-            *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
-        } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) {
-            *outputIt++ = ((codepoint >> (6 * 2)) | 0xe0);  // 3 leading 1s.
-            *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
-            *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
-        } else {
-            *outputIt++ = ((codepoint >> (6 * 3)) | 0xf0);  // 4 leading 1s.
-            *outputIt++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80);
-            *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
-            *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
-        }
+        appendUtf8Codepoint(codepoint, &outputIt);
     }
 
-    return {std::move(buffer), outputIt};
+    buffer->setlen(outputIt - buffer->buf());
+    return {buffer->buf(), size_t(buffer->len())};
 }
 
 bool String::substrMatch(const std::string& str,
@@ -288,8 +267,10 @@ bool String::substrMatch(const std::string& str,
         options &= ~kCaseSensitive;
     }
 
-    auto haystack = caseFoldAndStripDiacritics(str, options, cfMode);
-    auto needle = caseFoldAndStripDiacritics(find, options, cfMode);
+    StackBufBuilder haystackBuf;
+    StackBufBuilder needleBuf;
+    auto haystack = caseFoldAndStripDiacritics(&haystackBuf, str, options, cfMode);
+    auto needle = caseFoldAndStripDiacritics(&needleBuf, find, options, cfMode);
 
     // Case sensitive and diacritic sensitive.
     return boost::algorithm::boyer_moore_search(
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
index 64a0d89918b..dac83ed3b24 100644
--- a/src/mongo/db/fts/unicode/string.h
+++ b/src/mongo/db/fts/unicode/string.h
@@ -33,6 +33,7 @@
 #include <string>
 
 #include "mongo/base/string_data.h"
+#include "mongo/bson/util/builder.h"
 #include "mongo/db/fts/unicode/codepoints.h"
 
 namespace mongo {
@@ -45,34 +46,6 @@ namespace unicode {
  */
 class String {
 public:
-    /**
-     * A StringData that may own its own buffer.
-     */
-    class MaybeOwnedStringData : public StringData {
-    public:
-        /**
-         * Makes an empty, unowned string.
-         */
-        MaybeOwnedStringData() = default;
-
-        /**
-         * Makes an owned string.
-         */
-        MaybeOwnedStringData(std::unique_ptr<char[]>&& buffer, const char* endIt)
-            : StringData(buffer.get(), endIt - buffer.get()), _buffer(std::move(buffer)) {}
-
-        /**
-         * Makes an unowned string.
-         */
-        /*implicit*/ MaybeOwnedStringData(StringData str) : StringData(str) {}
-        MaybeOwnedStringData& operator=(StringData str) {
-            return (*this = MaybeOwnedStringData(str));
-        }
-
-    private:
-        std::unique_ptr<char[]> _buffer;
-    };
-
     String() = default;
 
 #if defined(_MSC_VER) && _MSC_VER < 1900
@@ -96,39 +69,19 @@ public:
     void resetData(const StringData utf8_src);
 
     /**
-     * Return a lowercased version of the String instance.
-     */
-    String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const;
-
-    /**
-     * Returns a version of the String instance with diacritics and combining marks removed.
-     */
-    String removeDiacritics() const;
-
-    /**
-     * Returns a substring of the String instance, using the same semantics as std::string::substr.
-     */
-    String substr(size_t begin, size_t end) const;
-
-    /**
-     * Copies the current String to another String.
-     */
-    void copyToBuf(String& buffer) const;
-
-    /**
      * Takes a substring of the current String and puts it in another String.
+     * Overwrites buffer's previous contents rather than appending.
      */
-    void substrToBuf(size_t pos, size_t len, String& buffer) const;
-
-    /**
-     * Lowercases the current String and stores the result in another String.
-     */
-    void toLowerToBuf(CaseFoldMode mode, String& buffer) const;
+    StringData substrToBuf(StackBufBuilder* buffer, size_t pos, size_t len) const;
 
     /**
-     * Removes diacritics from the current String and stores the result in another String.
+     * Lowercases a substring of the current String and stores the UTF8 result in buffer.
+     * Overwrites buffer's previous contents rather than appending.
      */
-    void removeDiacriticsToBuf(String& buffer) const;
+    StringData toLowerToBuf(StackBufBuilder* buffer,
+                            CaseFoldMode mode,
+                            size_t offset = 0,
+                            size_t len = std::string::npos) const;
 
     /**
      * Returns a UTF-8 encoded std::string version of the String instance. Uses the conversion
@@ -183,10 +136,20 @@ public:
 
     /**
      * Strips diacritics and case-folds the utf8 input string, as needed to support options.
-     */
-    static MaybeOwnedStringData caseFoldAndStripDiacritics(StringData utf8,
-                                                           SubstrMatchOptions options,
-                                                           CaseFoldMode mode);
+     *
+     * The options field specifies what operations to *skip*, so kCaseSensitive means to skip case
+     * folding and kDiacriticSensitive means to skip diacritic stripping. If both flags are
+     * specified, the input utf8 StringData is returned directly without any processing or copying.
+     *
+     * If processing is performed, the returned StringData will be placed in buffer. buffer's
+     * contents (if any) will be replaced. Since we may return the input unmodified the returned
+     * StringData's lifetime is the shorter of the input utf8 and the next modification to buffer.
+     * The input utf8 must not point into buffer.
+     */
+    static StringData caseFoldAndStripDiacritics(StackBufBuilder* buffer,
+                                                 StringData utf8,
+                                                 SubstrMatchOptions options,
+                                                 CaseFoldMode mode);
 
 private:
     /**
@@ -195,6 +158,15 @@ private:
     void setData(const StringData utf8_src);
 
     /**
+     * Unified implementation of substrToBuf and toLowerToBuf.
+     */
+    template <typename Func>
+    StringData substrToBufWithTransform(StackBufBuilder* buffer,
+                                        size_t pos,
+                                        size_t len,
+                                        Func transform) const;
+
+    /**
      * The underlying UTF-32 data.
      */
     std::u32string _data;
diff --git a/src/mongo/db/fts/unicode/string_test.cpp b/src/mongo/db/fts/unicode/string_test.cpp
index d627120e9e7..e67228f76de 100644
--- a/src/mongo/db/fts/unicode/string_test.cpp
+++ b/src/mongo/db/fts/unicode/string_test.cpp
@@ -61,19 +61,30 @@ auto kNormal = CaseFoldMode::kNormal;
 
 
 // Macro to preserve line numbers and arguments in error messages.
-#define TEST_CASE_FOLD_AND_STRIP_DIACRITICS(expected, input, options, caseFoldMode)        \
-    ASSERT_EQ(expected, String::caseFoldAndStripDiacritics(input, options, caseFoldMode)); \
-    ASSERT_EQ(expected + filler,                                                           \
-              String::caseFoldAndStripDiacritics(input + filler, options, caseFoldMode))
+#define TEST_CASE_FOLD_AND_STRIP_DIACRITICS(expected, input, options, caseFoldMode)           \
+    do {                                                                                      \
+        StackBufBuilder buf;                                                                  \
+        ASSERT_EQ(expected,                                                                   \
+                  String::caseFoldAndStripDiacritics(&buf, input, options, caseFoldMode));    \
+        ASSERT_EQ(                                                                            \
+            expected + filler,                                                                \
+            String::caseFoldAndStripDiacritics(&buf, input + filler, options, caseFoldMode)); \
+    } while (0)
+
+TEST(UnicodeString, SubstrTest) {
+    StackBufBuilder buf;
+    String indexes("01234");
+    ASSERT_EQ("123", indexes.substrToBuf(&buf, 1, 3));
+    ASSERT_EQ("4", indexes.substrToBuf(&buf, 4, 3));  // len too long.
+    ASSERT_EQ("", indexes.substrToBuf(&buf, 6, 3));   // pos past end.
+    ASSERT_EQ("", indexes.substrToBuf(&buf, 1, 0));   // len == 0.
+}
 
 TEST(UnicodeString, RemoveDiacritics) {
     // Test all ascii chars.
     for (unsigned char ch = 0; ch <= 0x7F; ch++) {
         const auto input = std::string(1, ch);
         const auto output = codepointIsDiacritic(ch) ? std::string() : std::string(1, ch);
-        if (ch) {  // String's constructor doesn't handle embedded NUL bytes.
-            ASSERT_EQUALS(output, String(input).removeDiacritics().toString());
-        }
         TEST_CASE_FOLD_AND_STRIP_DIACRITICS(output, input, kCaseSensitive, kNormal);
     }
 
@@ -83,21 +94,20 @@ TEST(UnicodeString, RemoveDiacritics) {
     // NFD Normalized Text ("Café").
     const char test2[] = {'C', 'a', 'f', 'e', static_cast<char>(0xcc), static_cast<char>(0x81), 0};
 
-    ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), String(test1).removeDiacritics().toString());
-    ASSERT_EQUALS(UTF8("Cafe"), String(test2).removeDiacritics().toString());
-
     TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
         UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal);
     TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("Cafe"), test2, kCaseSensitive, kNormal);
 }
 
 TEST(UnicodeString, CaseFolding) {
+    StackBufBuilder buf;
+
     // Test all ascii chars.
     for (unsigned char ch = 0; ch <= 0x7F; ch++) {
         const auto upper = std::string(1, ch);
         const auto lower = std::string(1, std::tolower(ch));
         if (ch) {  // String's constructor doesn't handle embedded NUL bytes.
-            ASSERT_EQUALS(lower, String(upper).toLower().toString());
+            ASSERT_EQUALS(lower, String(upper).toLowerToBuf(&buf, kNormal));
         }
         TEST_CASE_FOLD_AND_STRIP_DIACRITICS(lower, upper, kDiacriticSensitive, kNormal);
     }
@@ -105,8 +115,8 @@ TEST(UnicodeString, CaseFolding) {
     const char test1[] = UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?");
     const char test2[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?");
 
-    ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLower().toString());
-    ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLower().toString());
+    ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLowerToBuf(&buf, kNormal));
+    ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLowerToBuf(&buf, kNormal));
 
     TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
         UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal);
@@ -115,13 +125,14 @@ TEST(UnicodeString, CaseFolding) {
 }
 
 TEST(UnicodeString, CaseFoldingTurkish) {
+    StackBufBuilder buf;
     const char test1[] = UTF8("KAC YASINDASINIZ");
     const char test2[] = UTF8("KAC YASİNDASİNİZ");
 
     ASSERT_EQUALS(UTF8("kac yasındasınız"),
-                  String(test1).toLower(CaseFoldMode::kTurkish).toString());
+                  String(test1).toLowerToBuf(&buf, CaseFoldMode::kTurkish));
     ASSERT_EQUALS(UTF8("kac yasindasiniz"),
-                  String(test2).toLower(CaseFoldMode::kTurkish).toString());
+                  String(test2).toLowerToBuf(&buf, CaseFoldMode::kTurkish));
 
     TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
         UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish);
@@ -137,12 +148,6 @@ TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) {
     // NFD Normalized Text ("CAFÉ").
     const char test3[] = {'C', 'A', 'F', 'E', static_cast<char>(0xcc), static_cast<char>(0x81), 0};
 
-    ASSERT_EQUALS(UTF8("ποσο χρονων εισαι?"),
-                  String(test1).toLower().removeDiacritics().toString());
-    ASSERT_EQUALS(UTF8("¿cuantos anos tienes tu?"),
-                  String(test2).toLower().removeDiacritics().toString());
-    ASSERT_EQUALS(UTF8("cafe"), String(test3).toLower().removeDiacritics().toString());
-
     TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal);
     TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal);
     TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("cafe"), test3, 0, kNormal);
@@ -214,13 +219,16 @@ TEST(UnicodeString, BadUTF8) {
     ASSERT_THROWS(String test3(invalid3), AssertionException);
     ASSERT_THROWS(String test4(invalid4), AssertionException);
 
+    StackBufBuilder buf;
+
     // caseFoldAndStripDiacritics doesn't make any guarantees about behavior when fed invalid utf8.
     // These calls are to ensure that they don't trigger any faults in sanitizing builds.
-    String::caseFoldAndStripDiacritics(invalid1, 0, kNormal);
-    String::caseFoldAndStripDiacritics(invalid2, 0, kNormal);
-    String::caseFoldAndStripDiacritics(invalid3, 0, kNormal);
+    String::caseFoldAndStripDiacritics(&buf, invalid1, 0, kNormal);
+    String::caseFoldAndStripDiacritics(&buf, invalid2, 0, kNormal);
+    String::caseFoldAndStripDiacritics(&buf, invalid3, 0, kNormal);
 
-    ASSERT_THROWS(String::caseFoldAndStripDiacritics(invalid4, 0, kNormal), AssertionException);
+    ASSERT_THROWS(String::caseFoldAndStripDiacritics(&buf, invalid4, 0, kNormal),
+                  AssertionException);
 }
 
 TEST(UnicodeString, UTF32ToUTF8) {
author	Mathias Stearn <mathias@10gen.com>	2016-03-02 18:31:44 -0500
committer	Mathias Stearn <mathias@10gen.com>	2016-03-11 08:55:40 -0500
commit	4b10e50494175df2b1ed8fc4f8e7f8c6ca6f06d5 (patch)
tree	5c799a4d1b0b565854687ef4f25f8f3308afd462 /src/mongo/db/fts
parent	72aab77138463d96494389bc538c13395c34a2d3 (diff)
download	mongo-4b10e50494175df2b1ed8fc4f8e7f8c6ca6f06d5.tar.gz