summaryrefslogtreecommitdiff
path: root/src/mongo/db/fts
diff options
context:
space:
mode:
authorMathias Stearn <mathias@10gen.com>2016-03-02 18:31:44 -0500
committerMathias Stearn <mathias@10gen.com>2016-03-11 08:55:40 -0500
commit4b10e50494175df2b1ed8fc4f8e7f8c6ca6f06d5 (patch)
tree5c799a4d1b0b565854687ef4f25f8f3308afd462 /src/mongo/db/fts
parent72aab77138463d96494389bc538c13395c34a2d3 (diff)
downloadmongo-4b10e50494175df2b1ed8fc4f8e7f8c6ca6f06d5.tar.gz
SERVER-19936 Optimize UnicodeFTSTokenizer
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.cpp6
-rw-r--r--src/mongo/db/fts/fts_spec_legacy.cpp2
-rw-r--r--src/mongo/db/fts/fts_unicode_tokenizer.cpp43
-rw-r--r--src/mongo/db/fts/fts_unicode_tokenizer.h13
-rw-r--r--src/mongo/db/fts/stemmer.cpp9
-rw-r--r--src/mongo/db/fts/stemmer.h11
-rw-r--r--src/mongo/db/fts/stop_words.cpp2
-rw-r--r--src/mongo/db/fts/stop_words.h8
-rw-r--r--src/mongo/db/fts/unicode/gen_delimiter_list.py33
-rw-r--r--src/mongo/db/fts/unicode/string.cpp143
-rw-r--r--src/mongo/db/fts/unicode/string.h92
-rw-r--r--src/mongo/db/fts/unicode/string_test.cpp58
12 files changed, 191 insertions, 229 deletions
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp
index d8cb9874772..c43376b23ea 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer.cpp
+++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp
@@ -68,9 +68,7 @@ bool BasicFTSTokenizer::moveNext() {
continue;
}
- string word = token.data.toString();
-
- word = tolowerString(token.data);
+ string word = tolowerString(token.data);
// Stop words are case-sensitive so we need them to be lower cased to check
// against the stop word list
@@ -82,7 +80,7 @@ bool BasicFTSTokenizer::moveNext() {
word = token.data.toString();
}
- _stem = _stemmer.stem(word);
+ _stem = _stemmer.stem(word).toString();
return true;
}
}
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp
index 8d36b51ca61..b7867dba520 100644
--- a/src/mongo/db/fts/fts_spec_legacy.cpp
+++ b/src/mongo/db/fts/fts_spec_legacy.cpp
@@ -81,7 +81,7 @@ void FTSSpec::_scoreStringV1(const Tools& tools,
string term = tolowerString(t.data);
if (tools.stopwords->isStopWord(term))
continue;
- term = tools.stemmer->stem(term);
+ term = tools.stemmer->stem(term).toString();
ScoreHelperStruct& data = terms[term];
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.cpp b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
index 036a7703fc3..8648523e4af 100644
--- a/src/mongo/db/fts/fts_unicode_tokenizer.cpp
+++ b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
@@ -45,24 +45,19 @@ namespace fts {
using std::string;
UnicodeFTSTokenizer::UnicodeFTSTokenizer(const FTSLanguage* language)
- : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {
- if (_language->str() == "english") {
- _delimListLanguage = unicode::DelimiterListLanguage::kEnglish;
- } else {
- _delimListLanguage = unicode::DelimiterListLanguage::kNotEnglish;
- }
-
- if (_language->str() == "turkish") {
- _caseFoldMode = unicode::CaseFoldMode::kTurkish;
- } else {
- _caseFoldMode = unicode::CaseFoldMode::kNormal;
- }
-}
+ : _language(language),
+ _stemmer(language),
+ _stopWords(StopWords::getStopWords(language)),
+ _delimListLanguage(_language->str() == "english"
+ ? unicode::DelimiterListLanguage::kEnglish
+ : unicode::DelimiterListLanguage::kNotEnglish),
+ _caseFoldMode(_language->str() == "turkish" ? unicode::CaseFoldMode::kTurkish
+ : unicode::CaseFoldMode::kNormal) {}
void UnicodeFTSTokenizer::reset(StringData document, Options options) {
_options = options;
_pos = 0;
- _document.resetData(document);
+ _document.resetData(document); // Validates that document is valid UTF8.
// Skip any leading delimiters (and handle the case where the document is entirely delimiters).
_skipDelimiters();
@@ -71,7 +66,7 @@ void UnicodeFTSTokenizer::reset(StringData document, Options options) {
bool UnicodeFTSTokenizer::moveNext() {
while (true) {
if (_pos >= _document.size()) {
- _stem = "";
+ _word = "";
return false;
}
@@ -81,30 +76,30 @@ bool UnicodeFTSTokenizer::moveNext() {
(!unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage))) {
++_pos;
}
- _document.substrToBuf(start, _pos - start, _tokenBuf);
+ const size_t len = _pos - start;
// Skip the delimiters before the next token.
_skipDelimiters();
// Stop words are case-sensitive and diacritic sensitive, so we need them to be lower cased
// but with diacritics not removed to check against the stop word list.
- _tokenBuf.toLowerToBuf(_caseFoldMode, _wordBuf);
+ _word = _document.toLowerToBuf(&_wordBuf, _caseFoldMode, start, len);
- if ((_options & kFilterStopWords) && _stopWords->isStopWord(_wordBuf.toString())) {
+ if ((_options & kFilterStopWords) && _stopWords->isStopWord(_word)) {
continue;
}
if (_options & kGenerateCaseSensitiveTokens) {
- _tokenBuf.copyToBuf(_wordBuf);
+ _word = _document.substrToBuf(&_wordBuf, start, len);
}
// The stemmer is diacritic sensitive, so stem the word before removing diacritics.
- _stem = _stemmer.stem(_wordBuf.toString());
+ _word = _stemmer.stem(_word);
if (!(_options & kGenerateDiacriticSensitiveTokens)) {
- _tokenBuf.resetData(_stem);
- _tokenBuf.removeDiacriticsToBuf(_wordBuf);
- _stem = _wordBuf.toString();
+            // Can't use _wordBuf for output here because our input _word may point into it.
+ _word = unicode::String::caseFoldAndStripDiacritics(
+ &_finalBuf, _word, unicode::String::kCaseSensitive, _caseFoldMode);
}
return true;
@@ -112,7 +107,7 @@ bool UnicodeFTSTokenizer::moveNext() {
}
StringData UnicodeFTSTokenizer::get() const {
- return _stem;
+ return _word;
}
void UnicodeFTSTokenizer::_skipDelimiters() {
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.h b/src/mongo/db/fts/fts_unicode_tokenizer.h
index c6feeca9725..2b8c54e3e88 100644
--- a/src/mongo/db/fts/fts_unicode_tokenizer.h
+++ b/src/mongo/db/fts/fts_unicode_tokenizer.h
@@ -73,22 +73,19 @@ private:
*/
void _skipDelimiters();
- unicode::DelimiterListLanguage _delimListLanguage;
- unicode::CaseFoldMode _caseFoldMode;
-
const FTSLanguage* const _language;
const Stemmer _stemmer;
const StopWords* const _stopWords;
+ const unicode::DelimiterListLanguage _delimListLanguage;
+ const unicode::CaseFoldMode _caseFoldMode;
unicode::String _document;
size_t _pos;
-
- unicode::String _tokenBuf;
- unicode::String _wordBuf;
-
+ StringData _word;
Options _options;
- std::string _stem;
+ StackBufBuilder _wordBuf;
+ StackBufBuilder _finalBuf;
};
} // namespace fts
diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp
index 07d17c050eb..19995baae4e 100644
--- a/src/mongo/db/fts/stemmer.cpp
+++ b/src/mongo/db/fts/stemmer.cpp
@@ -29,7 +29,6 @@
*/
#include <cstdlib>
-#include <string>
#include "mongo/db/fts/stemmer.h"
#include "mongo/util/mongoutils/str.h"
@@ -38,8 +37,6 @@ namespace mongo {
namespace fts {
-using std::string;
-
Stemmer::Stemmer(const FTSLanguage* language) {
_stemmer = NULL;
if (language->str() != "none")
@@ -53,9 +50,9 @@ Stemmer::~Stemmer() {
}
}
-string Stemmer::stem(StringData word) const {
+StringData Stemmer::stem(StringData word) const {
if (!_stemmer)
- return word.toString();
+ return word;
const sb_symbol* sb_sym =
sb_stemmer_stem(_stemmer, (const sb_symbol*)word.rawData(), word.size());
@@ -65,7 +62,7 @@ string Stemmer::stem(StringData word) const {
invariant(false);
}
- return string((const char*)(sb_sym), sb_stemmer_length(_stemmer));
+ return StringData((const char*)(sb_sym), sb_stemmer_length(_stemmer));
}
}
}
diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h
index 59261bfb6a0..80bfdc2faf9 100644
--- a/src/mongo/db/fts/stemmer.h
+++ b/src/mongo/db/fts/stemmer.h
@@ -31,8 +31,6 @@
#pragma once
-#include <string>
-
#include "mongo/base/string_data.h"
#include "mongo/db/fts/fts_language.h"
#include "third_party/libstemmer_c/include/libstemmer.h"
@@ -53,7 +51,14 @@ public:
Stemmer(const FTSLanguage* language);
~Stemmer();
- std::string stem(StringData word) const;
+ /**
+ * Stems an input word.
+ *
+ * The returned StringData is valid until the next call to any method on this object. Since the
+ * input may be returned unmodified, the output's lifetime may also expire when the input's
+ * does.
+ */
+ StringData stem(StringData word) const;
private:
struct sb_stemmer* _stemmer;
diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp
index 0a44eaf25ff..6e108db70fc 100644
--- a/src/mongo/db/fts/stop_words.cpp
+++ b/src/mongo/db/fts/stop_words.cpp
@@ -52,7 +52,7 @@ StopWords::StopWords() {}
StopWords::StopWords(const std::set<std::string>& words) {
for (std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i)
- _words.insert(*i);
+ _words[*i] = true;
}
const StopWords* StopWords::getStopWords(const FTSLanguage* language) {
diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h
index eebc11c012a..f80b638510d 100644
--- a/src/mongo/db/fts/stop_words.h
+++ b/src/mongo/db/fts/stop_words.h
@@ -35,7 +35,7 @@
#include <string>
#include "mongo/db/fts/fts_language.h"
-#include "mongo/platform/unordered_set.h"
+#include "mongo/util/string_map.h"
namespace mongo {
@@ -48,8 +48,8 @@ public:
StopWords();
StopWords(const std::set<std::string>& words);
- bool isStopWord(const std::string& word) const {
- return _words.count(word) > 0;
+ bool isStopWord(StringData word) const {
+ return _words.find(word) != _words.end();
}
size_t numStopWords() const {
@@ -59,7 +59,7 @@ public:
static const StopWords* getStopWords(const FTSLanguage* language);
private:
- unordered_set<std::string> _words;
+ StringMap<bool> _words; // Used as a set. The values have no meaning.
};
}
}
diff --git a/src/mongo/db/fts/unicode/gen_delimiter_list.py b/src/mongo/db/fts/unicode/gen_delimiter_list.py
index d8a788c74ff..3c3131a8f6b 100644
--- a/src/mongo/db/fts/unicode/gen_delimiter_list.py
+++ b/src/mongo/db/fts/unicode/gen_delimiter_list.py
@@ -58,23 +58,32 @@ def generate(unicode_proplist_file, target):
# As of Unicode 8.0.0, all of the delimiters we used for text index
# version 2 are also in the list.
-
- out.write("""bool codepointIsDelimiter(char32_t codepoint, \
-DelimiterListLanguage lang) {
- if (lang == DelimiterListLanguage::kEnglish && codepoint == '\\'') {
- return false;
- }
-
- // Most characters are latin letters, so filter those out first.
- if (codepoint >= 'A' && codepoint <= 'Z') {
- return false;
- } else if (codepoint >= 'a' && codepoint <= 'z') {
- return false;
+ out.write("static const bool englishAsciiDelimiters[128] = {\n")
+ for cp in range(0x80):
+ if cp == ord("'"):
+ out.write(" 0, // ' special case\n")
+ else:
+ out.write(" %d, // 0x%x\n" % (cp in delim_codepoints, cp))
+ out.write("};\n")
+
+ out.write("static const bool nonEnglishAsciiDelimiters[128] = {\n")
+ for cp in range(0x80):
+ out.write(" %d, // 0x%x\n" % (cp in delim_codepoints, cp))
+ out.write("};\n")
+
+ out.write("""bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang) {
+ if (codepoint <= 0x7f) {
+ if (lang == DelimiterListLanguage::kEnglish) {
+ return englishAsciiDelimiters[codepoint];
+ }
+ return nonEnglishAsciiDelimiters[codepoint];
}
switch (codepoint) {\n""")
for delim in sorted(delim_codepoints):
+ if delim <= 0x7f: # ascii codepoints handled in lists above.
+ continue
out.write("\
case " + str(hex(delim)) + ": return true;\n")
diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp
index 3218f04dbf7..10737acc3ed 100644
--- a/src/mongo/db/fts/unicode/string.cpp
+++ b/src/mongo/db/fts/unicode/string.cpp
@@ -39,6 +39,28 @@
namespace mongo {
namespace unicode {
+namespace {
+template <typename OutputIterator>
+inline void appendUtf8Codepoint(char32_t codepoint, OutputIterator* outputIt) {
+ if (codepoint <= 0x7f /* max 1-byte codepoint */) {
+ *(*outputIt)++ = (codepoint);
+ } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) {
+ *(*outputIt)++ = ((codepoint >> (6 * 1)) | 0xc0); // 2 leading 1s.
+ *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+ } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) {
+ *(*outputIt)++ = ((codepoint >> (6 * 2)) | 0xe0); // 3 leading 1s.
+ *(*outputIt)++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
+ *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+ } else {
+ uassert(ErrorCodes::BadValue, "text contains invalid UTF-8", codepoint <= 0x10FFFF);
+ *(*outputIt)++ = ((codepoint >> (6 * 3)) | 0xf0); // 4 leading 1s.
+ *(*outputIt)++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80);
+ *(*outputIt)++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
+ *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+ }
+}
+}
+
using linenoise_utf8::copyString32to8;
using linenoise_utf8::copyString8to32;
@@ -93,74 +115,45 @@ std::string String::toString() {
return _outputBuf;
}
-String String::substr(size_t pos, size_t len) const {
- unicode::String buf;
- substrToBuf(pos, len, buf);
- return buf;
-}
-
-String String::toLower(CaseFoldMode mode) const {
- unicode::String buf;
- toLowerToBuf(mode, buf);
- return buf;
-}
-
-String String::removeDiacritics() const {
- unicode::String buf;
- removeDiacriticsToBuf(buf);
- return buf;
-}
-
-void String::copyToBuf(String& buffer) const {
- buffer._data = _data;
- buffer._data.resize(_data.size());
- auto index = 0;
- for (auto codepoint : _data) {
- buffer._data[index++] = codepoint;
+template <typename Func>
+StringData String::substrToBufWithTransform(StackBufBuilder* buffer,
+ size_t pos,
+ size_t len,
+ Func func) const {
+ pos = std::min(pos, _data.size());
+ len = std::min(len, _data.size() - pos);
+
+ buffer->reset();
+ auto outputIt = buffer->skip(len * 4); // Reserve room for worst-case expansion.
+ auto inputIt = _data.begin() + pos;
+ for (size_t i = 0; i < len; i++) {
+ appendUtf8Codepoint(func(*inputIt++), &outputIt);
}
- buffer._needsOutputConversion = true;
+ buffer->setlen(outputIt - buffer->buf());
+ return {buffer->buf(), size_t(buffer->len())};
}
-void String::substrToBuf(size_t pos, size_t len, String& buffer) const {
- buffer._data.resize(len + 1);
- for (size_t index = 0, src_pos = pos; index < len;) {
- buffer._data[index++] = _data[src_pos++];
- }
- buffer._data[len] = '\0';
- buffer._needsOutputConversion = true;
+StringData String::substrToBuf(StackBufBuilder* buffer, size_t pos, size_t len) const {
+ const auto identityFunc = [](char32_t ch) { return ch; };
+ return substrToBufWithTransform(buffer, pos, len, identityFunc);
}
-void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const {
- buffer._data.resize(_data.size());
- auto outIt = buffer._data.begin();
- for (auto codepoint : _data) {
- *outIt++ = codepointToLower(codepoint, mode);
- }
- buffer._needsOutputConversion = true;
+StringData String::toLowerToBuf(StackBufBuilder* buffer,
+ CaseFoldMode mode,
+ size_t pos,
+ size_t len) const {
+ const auto toLower = [mode](char32_t ch) { return codepointToLower(ch, mode); };
+ return substrToBufWithTransform(buffer, pos, len, toLower);
}
-void String::removeDiacriticsToBuf(String& buffer) const {
- buffer._data.resize(_data.size());
- auto outIt = buffer._data.begin();
- for (auto codepoint : _data) {
- if (codepoint <= 0x7f) {
- // ASCII only has two diacritics so they are hard-coded here.
- if (codepoint != '^' && codepoint != '`') {
- *outIt++ = codepoint;
- }
- } else if (auto clean = codepointRemoveDiacritics(codepoint)) {
- *outIt++ = clean;
- } else {
- // codepoint was a pure diacritic mark, so skip it.
- }
- }
- buffer._data.resize(outIt - buffer._data.begin());
- buffer._needsOutputConversion = true;
-}
-String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8,
- SubstrMatchOptions options,
- CaseFoldMode mode) {
+StringData String::caseFoldAndStripDiacritics(StackBufBuilder* buffer,
+ StringData utf8,
+ SubstrMatchOptions options,
+ CaseFoldMode mode) {
+    // This fires if your input buffer is the same as your output buffer.
+ invariant(buffer->buf() != utf8.rawData());
+
if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) {
// No transformation needed. Just return the input data unmodified.
return utf8;
@@ -170,8 +163,8 @@ String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8,
// and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2
// bytes. The biggest codepoint is 4 bytes which is also 2x 2 bytes. This holds as long as we
// don't map a single code point to more than one.
- std::unique_ptr<char[]> buffer(new char[utf8.size() * 2]);
- auto outputIt = buffer.get();
+ buffer->reset();
+ auto outputIt = buffer->skip(utf8.size() * 2);
for (auto inputIt = utf8.begin(), endIt = utf8.end(); inputIt != endIt;) {
#ifdef MONGO_HAVE_FAST_BYTE_VECTOR
@@ -258,25 +251,11 @@ String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8,
}
}
- // Back to utf-8.
- if (codepoint <= 0x7f /* max 1-byte codepoint */) {
- *outputIt++ = (codepoint);
- } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) {
- *outputIt++ = ((codepoint >> (6 * 1)) | 0xc0); // 2 leading 1s.
- *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
- } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) {
- *outputIt++ = ((codepoint >> (6 * 2)) | 0xe0); // 3 leading 1s.
- *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
- *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
- } else {
- *outputIt++ = ((codepoint >> (6 * 3)) | 0xf0); // 4 leading 1s.
- *outputIt++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80);
- *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
- *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
- }
+ appendUtf8Codepoint(codepoint, &outputIt);
}
- return {std::move(buffer), outputIt};
+ buffer->setlen(outputIt - buffer->buf());
+ return {buffer->buf(), size_t(buffer->len())};
}
bool String::substrMatch(const std::string& str,
@@ -288,8 +267,10 @@ bool String::substrMatch(const std::string& str,
options &= ~kCaseSensitive;
}
- auto haystack = caseFoldAndStripDiacritics(str, options, cfMode);
- auto needle = caseFoldAndStripDiacritics(find, options, cfMode);
+ StackBufBuilder haystackBuf;
+ StackBufBuilder needleBuf;
+ auto haystack = caseFoldAndStripDiacritics(&haystackBuf, str, options, cfMode);
+ auto needle = caseFoldAndStripDiacritics(&needleBuf, find, options, cfMode);
// Case sensitive and diacritic sensitive.
return boost::algorithm::boyer_moore_search(
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
index 64a0d89918b..dac83ed3b24 100644
--- a/src/mongo/db/fts/unicode/string.h
+++ b/src/mongo/db/fts/unicode/string.h
@@ -33,6 +33,7 @@
#include <string>
#include "mongo/base/string_data.h"
+#include "mongo/bson/util/builder.h"
#include "mongo/db/fts/unicode/codepoints.h"
namespace mongo {
@@ -45,34 +46,6 @@ namespace unicode {
*/
class String {
public:
- /**
- * A StringData that may own its own buffer.
- */
- class MaybeOwnedStringData : public StringData {
- public:
- /**
- * Makes an empty, unowned string.
- */
- MaybeOwnedStringData() = default;
-
- /**
- * Makes an owned string.
- */
- MaybeOwnedStringData(std::unique_ptr<char[]>&& buffer, const char* endIt)
- : StringData(buffer.get(), endIt - buffer.get()), _buffer(std::move(buffer)) {}
-
- /**
- * Makes an unowned string.
- */
- /*implicit*/ MaybeOwnedStringData(StringData str) : StringData(str) {}
- MaybeOwnedStringData& operator=(StringData str) {
- return (*this = MaybeOwnedStringData(str));
- }
-
- private:
- std::unique_ptr<char[]> _buffer;
- };
-
String() = default;
#if defined(_MSC_VER) && _MSC_VER < 1900
@@ -96,39 +69,19 @@ public:
void resetData(const StringData utf8_src);
/**
- * Return a lowercased version of the String instance.
- */
- String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const;
-
- /**
- * Returns a version of the String instance with diacritics and combining marks removed.
- */
- String removeDiacritics() const;
-
- /**
- * Returns a substring of the String instance, using the same semantics as std::string::substr.
- */
- String substr(size_t begin, size_t end) const;
-
- /**
- * Copies the current String to another String.
- */
- void copyToBuf(String& buffer) const;
-
- /**
* Takes a substring of the current String and puts it in another String.
+ * Overwrites buffer's previous contents rather than appending.
*/
- void substrToBuf(size_t pos, size_t len, String& buffer) const;
-
- /**
- * Lowercases the current String and stores the result in another String.
- */
- void toLowerToBuf(CaseFoldMode mode, String& buffer) const;
+ StringData substrToBuf(StackBufBuilder* buffer, size_t pos, size_t len) const;
/**
- * Removes diacritics from the current String and stores the result in another String.
+ * Lowercases a substring of the current String and stores the UTF8 result in buffer.
+ * Overwrites buffer's previous contents rather than appending.
*/
- void removeDiacriticsToBuf(String& buffer) const;
+ StringData toLowerToBuf(StackBufBuilder* buffer,
+ CaseFoldMode mode,
+ size_t offset = 0,
+ size_t len = std::string::npos) const;
/**
* Returns a UTF-8 encoded std::string version of the String instance. Uses the conversion
@@ -183,10 +136,20 @@ public:
/**
* Strips diacritics and case-folds the utf8 input string, as needed to support options.
- */
- static MaybeOwnedStringData caseFoldAndStripDiacritics(StringData utf8,
- SubstrMatchOptions options,
- CaseFoldMode mode);
+ *
+ * The options field specifies what operations to *skip*, so kCaseSensitive means to skip case
+     * folding and kDiacriticSensitive means to skip diacritic stripping. If both flags are
+ * specified, the input utf8 StringData is returned directly without any processing or copying.
+ *
+ * If processing is performed, the returned StringData will be placed in buffer. buffer's
+ * contents (if any) will be replaced. Since we may return the input unmodified the returned
+ * StringData's lifetime is the shorter of the input utf8 and the next modification to buffer.
+ * The input utf8 must not point into buffer.
+ */
+ static StringData caseFoldAndStripDiacritics(StackBufBuilder* buffer,
+ StringData utf8,
+ SubstrMatchOptions options,
+ CaseFoldMode mode);
private:
/**
@@ -195,6 +158,15 @@ private:
void setData(const StringData utf8_src);
/**
+ * Unified implementation of substrToBuf and toLowerToBuf.
+ */
+ template <typename Func>
+ StringData substrToBufWithTransform(StackBufBuilder* buffer,
+ size_t pos,
+ size_t len,
+ Func transform) const;
+
+ /**
* The underlying UTF-32 data.
*/
std::u32string _data;
diff --git a/src/mongo/db/fts/unicode/string_test.cpp b/src/mongo/db/fts/unicode/string_test.cpp
index d627120e9e7..e67228f76de 100644
--- a/src/mongo/db/fts/unicode/string_test.cpp
+++ b/src/mongo/db/fts/unicode/string_test.cpp
@@ -61,19 +61,30 @@ auto kNormal = CaseFoldMode::kNormal;
// Macro to preserve line numbers and arguments in error messages.
-#define TEST_CASE_FOLD_AND_STRIP_DIACRITICS(expected, input, options, caseFoldMode) \
- ASSERT_EQ(expected, String::caseFoldAndStripDiacritics(input, options, caseFoldMode)); \
- ASSERT_EQ(expected + filler, \
- String::caseFoldAndStripDiacritics(input + filler, options, caseFoldMode))
+#define TEST_CASE_FOLD_AND_STRIP_DIACRITICS(expected, input, options, caseFoldMode) \
+ do { \
+ StackBufBuilder buf; \
+ ASSERT_EQ(expected, \
+ String::caseFoldAndStripDiacritics(&buf, input, options, caseFoldMode)); \
+ ASSERT_EQ( \
+ expected + filler, \
+ String::caseFoldAndStripDiacritics(&buf, input + filler, options, caseFoldMode)); \
+ } while (0)
+
+TEST(UnicodeString, SubstrTest) {
+ StackBufBuilder buf;
+ String indexes("01234");
+ ASSERT_EQ("123", indexes.substrToBuf(&buf, 1, 3));
+ ASSERT_EQ("4", indexes.substrToBuf(&buf, 4, 3)); // len too long.
+ ASSERT_EQ("", indexes.substrToBuf(&buf, 6, 3)); // pos past end.
+ ASSERT_EQ("", indexes.substrToBuf(&buf, 1, 0)); // len == 0.
+}
TEST(UnicodeString, RemoveDiacritics) {
// Test all ascii chars.
for (unsigned char ch = 0; ch <= 0x7F; ch++) {
const auto input = std::string(1, ch);
const auto output = codepointIsDiacritic(ch) ? std::string() : std::string(1, ch);
- if (ch) { // String's constructor doesn't handle embedded NUL bytes.
- ASSERT_EQUALS(output, String(input).removeDiacritics().toString());
- }
TEST_CASE_FOLD_AND_STRIP_DIACRITICS(output, input, kCaseSensitive, kNormal);
}
@@ -83,21 +94,20 @@ TEST(UnicodeString, RemoveDiacritics) {
// NFD Normalized Text ("Café").
const char test2[] = {'C', 'a', 'f', 'e', static_cast<char>(0xcc), static_cast<char>(0x81), 0};
- ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), String(test1).removeDiacritics().toString());
- ASSERT_EQUALS(UTF8("Cafe"), String(test2).removeDiacritics().toString());
-
TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal);
TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("Cafe"), test2, kCaseSensitive, kNormal);
}
TEST(UnicodeString, CaseFolding) {
+ StackBufBuilder buf;
+
// Test all ascii chars.
for (unsigned char ch = 0; ch <= 0x7F; ch++) {
const auto upper = std::string(1, ch);
const auto lower = std::string(1, std::tolower(ch));
if (ch) { // String's constructor doesn't handle embedded NUL bytes.
- ASSERT_EQUALS(lower, String(upper).toLower().toString());
+ ASSERT_EQUALS(lower, String(upper).toLowerToBuf(&buf, kNormal));
}
TEST_CASE_FOLD_AND_STRIP_DIACRITICS(lower, upper, kDiacriticSensitive, kNormal);
}
@@ -105,8 +115,8 @@ TEST(UnicodeString, CaseFolding) {
const char test1[] = UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?");
const char test2[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?");
- ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLower().toString());
- ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLower().toString());
+ ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLowerToBuf(&buf, kNormal));
+ ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLowerToBuf(&buf, kNormal));
TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal);
@@ -115,13 +125,14 @@ TEST(UnicodeString, CaseFolding) {
}
TEST(UnicodeString, CaseFoldingTurkish) {
+ StackBufBuilder buf;
const char test1[] = UTF8("KAC YASINDASINIZ");
const char test2[] = UTF8("KAC YASİNDASİNİZ");
ASSERT_EQUALS(UTF8("kac yasındasınız"),
- String(test1).toLower(CaseFoldMode::kTurkish).toString());
+ String(test1).toLowerToBuf(&buf, CaseFoldMode::kTurkish));
ASSERT_EQUALS(UTF8("kac yasindasiniz"),
- String(test2).toLower(CaseFoldMode::kTurkish).toString());
+ String(test2).toLowerToBuf(&buf, CaseFoldMode::kTurkish));
TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish);
@@ -137,12 +148,6 @@ TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) {
// NFD Normalized Text ("CAFÉ").
const char test3[] = {'C', 'A', 'F', 'E', static_cast<char>(0xcc), static_cast<char>(0x81), 0};
- ASSERT_EQUALS(UTF8("ποσο χρονων εισαι?"),
- String(test1).toLower().removeDiacritics().toString());
- ASSERT_EQUALS(UTF8("¿cuantos anos tienes tu?"),
- String(test2).toLower().removeDiacritics().toString());
- ASSERT_EQUALS(UTF8("cafe"), String(test3).toLower().removeDiacritics().toString());
-
TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal);
TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal);
TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("cafe"), test3, 0, kNormal);
@@ -214,13 +219,16 @@ TEST(UnicodeString, BadUTF8) {
ASSERT_THROWS(String test3(invalid3), AssertionException);
ASSERT_THROWS(String test4(invalid4), AssertionException);
+ StackBufBuilder buf;
+
// caseFoldAndStripDiacritics doesn't make any guarantees about behavior when fed invalid utf8.
// These calls are to ensure that they don't trigger any faults in sanitizing builds.
- String::caseFoldAndStripDiacritics(invalid1, 0, kNormal);
- String::caseFoldAndStripDiacritics(invalid2, 0, kNormal);
- String::caseFoldAndStripDiacritics(invalid3, 0, kNormal);
+ String::caseFoldAndStripDiacritics(&buf, invalid1, 0, kNormal);
+ String::caseFoldAndStripDiacritics(&buf, invalid2, 0, kNormal);
+ String::caseFoldAndStripDiacritics(&buf, invalid3, 0, kNormal);
- ASSERT_THROWS(String::caseFoldAndStripDiacritics(invalid4, 0, kNormal), AssertionException);
+ ASSERT_THROWS(String::caseFoldAndStripDiacritics(&buf, invalid4, 0, kNormal),
+ AssertionException);
}
TEST(UnicodeString, UTF32ToUTF8) {