summaryrefslogtreecommitdiff
path: root/src/mongo/db/fts
diff options
context:
space:
mode:
author Adam Chelminski <adam.chelminski@mongodb.com> 2015-08-13 11:16:35 -0400
committer Adam Chelminski <adam.chelminski@mongodb.com> 2015-08-14 13:50:48 -0400
commit a0bbce24216ee2a8a30ef606a76c663d61aacff0 (patch)
tree 0f7cdbd9d385ace60abbcd5834636ca269df219d /src/mongo/db/fts
parent 3211eea8dbfe317ad3e1434abf1a4cd7190a1b1c (diff)
download mongo-a0bbce24216ee2a8a30ef606a76c663d61aacff0.tar.gz
SERVER-19944 Improve text index v3 performance
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r-- src/mongo/db/fts/fts_unicode_tokenizer.cpp 17
-rw-r--r-- src/mongo/db/fts/fts_unicode_tokenizer.h 3
-rw-r--r-- src/mongo/db/fts/unicode/gen_delimiter_list.py 7
-rw-r--r-- src/mongo/db/fts/unicode/string.cpp 87
-rw-r--r-- src/mongo/db/fts/unicode/string.h 38
5 files changed, 120 insertions, 32 deletions
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.cpp b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
index 8cdce180dea..a5460448505 100644
--- a/src/mongo/db/fts/fts_unicode_tokenizer.cpp
+++ b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
@@ -62,7 +62,7 @@ UnicodeFTSTokenizer::UnicodeFTSTokenizer(const FTSLanguage* language)
void UnicodeFTSTokenizer::reset(StringData document, Options options) {
_options = options;
_pos = 0;
- _document = unicode::String(document);
+ _document.resetData(document);
// Skip any leading delimiters (and handle the case where the document is entirely delimiters).
_skipDelimiters();
@@ -81,29 +81,30 @@ bool UnicodeFTSTokenizer::moveNext() {
(!unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage))) {
++_pos;
}
- unicode::String token = _document.substr(start, _pos - start);
+ _document.substrToBuf(start, _pos - start, _tokenBuf);
// Skip the delimiters before the next token.
_skipDelimiters();
// Stop words are case-sensitive and diacritic sensitive, so we need them to be lower cased
// but with diacritics not removed to check against the stop word list.
- unicode::String word = token.toLower(_caseFoldMode);
+ _tokenBuf.toLowerToBuf(_caseFoldMode, _wordBuf);
- if ((_options & kFilterStopWords) && _stopWords->isStopWord(word.toString())) {
+ if ((_options & kFilterStopWords) && _stopWords->isStopWord(_wordBuf.toString())) {
continue;
}
if (_options & kGenerateCaseSensitiveTokens) {
- word = token;
+ _tokenBuf.copyToBuf(_wordBuf);
}
// The stemmer is diacritic sensitive, so stem the word before removing diacritics.
- _stem = _stemmer.stem(word.toString());
+ _stem = _stemmer.stem(_wordBuf.toString());
if (!(_options & kGenerateDiacriticSensitiveTokens)) {
- token.resetData(_stem);
- _stem = token.removeDiacritics().toString();
+ _tokenBuf.resetData(_stem);
+ _tokenBuf.removeDiacriticsToBuf(_wordBuf);
+ _stem = _wordBuf.toString();
}
return true;
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.h b/src/mongo/db/fts/fts_unicode_tokenizer.h
index 0312ffc300b..c6feeca9725 100644
--- a/src/mongo/db/fts/fts_unicode_tokenizer.h
+++ b/src/mongo/db/fts/fts_unicode_tokenizer.h
@@ -83,6 +83,9 @@ private:
unicode::String _document;
size_t _pos;
+ unicode::String _tokenBuf;
+ unicode::String _wordBuf;
+
Options _options;
std::string _stem;
diff --git a/src/mongo/db/fts/unicode/gen_delimiter_list.py b/src/mongo/db/fts/unicode/gen_delimiter_list.py
index 52b79544c6b..5b678400e19 100644
--- a/src/mongo/db/fts/unicode/gen_delimiter_list.py
+++ b/src/mongo/db/fts/unicode/gen_delimiter_list.py
@@ -65,6 +65,13 @@ DelimiterListLanguage lang) {
return false;
}
+ // Most characters are latin letters, so filter those out first.
+ if (codepoint >= 'A' && codepoint <= 'Z') {
+ return false;
+ } else if (codepoint >= 'a' && codepoint <= 'z') {
+ return false;
+ }
+
switch (codepoint) {\n""")
for delim in sorted(delim_codepoints):
diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp
index 9f29749edbc..9c18a776787 100644
--- a/src/mongo/db/fts/unicode/string.cpp
+++ b/src/mongo/db/fts/unicode/string.cpp
@@ -42,10 +42,16 @@ using linenoise_utf8::copyString8to32;
using std::u32string;
String::String(const StringData utf8_src) {
+ // Reserve space for underlying buffers to prevent excessive reallocations.
+ _outputBuf.reserve(utf8_src.size() * 4);
+ _data.reserve(utf8_src.size() * 4);
+
+ // Convert UTF-8 input to UTF-32 data.
setData(utf8_src);
}
void String::resetData(const StringData utf8_src) {
+ // Convert UTF-8 input to UTF-32 data.
setData(utf8_src);
}
@@ -70,20 +76,28 @@ void String::setData(const StringData utf8_src) {
// Resize _data so it is only as big as what it contains.
_data.resize(resultSize);
+ _needsOutputConversion = true;
}
-String::String(u32string&& src) : _data(std::move(src)) {}
-
-std::string String::toString() const {
- // output is the target, resize it so that it's guaranteed to fit all of the input characters,
- // plus a null character if there isn't one.
- std::string output(_data.size() * 4 + 1, '\0');
- size_t resultSize =
- copyString32to8(reinterpret_cast<unsigned char*>(&output[0]), &_data[0], output.size());
+String::String(u32string&& src) : _data(std::move(src)), _needsOutputConversion(true) {
+ // Reserve up front to limit reallocations; src is moved-from here, so measure _data.
+ _outputBuf.reserve(_data.size() * 4);
+ _data.reserve(_data.size() * 4);
+}
- // Resize output so it is only as large as what it contains.
- output.resize(resultSize);
- return output;
+std::string String::toString() {
+ // _outputBuf is the target, resize it so that it's guaranteed to fit all of the input
+ // characters, plus a null character if there isn't one.
+ if (_needsOutputConversion) {
+ _outputBuf.resize(_data.size() * 4 + 1);
+ size_t resultSize = copyString32to8(
+ reinterpret_cast<unsigned char*>(&_outputBuf[0]), &_data[0], _outputBuf.size());
+
+ // Resize output so it is only as large as what it contains.
+ _outputBuf.resize(resultSize);
+ _needsOutputConversion = false;
+ }
+ return _outputBuf;
}
size_t String::size() const {
@@ -95,30 +109,61 @@ const char32_t& String::operator[](int i) const {
}
String String::substr(size_t pos, size_t len) const {
- return String(_data.substr(pos, len));
+ unicode::String buf;
+ substrToBuf(pos, len, buf);
+ return buf;
}
String String::toLower(CaseFoldMode mode) const {
- u32string newdata(_data.size(), 0);
+ unicode::String buf;
+ toLowerToBuf(mode, buf);
+ return buf;
+}
+
+String String::removeDiacritics() const {
+ unicode::String buf;
+ removeDiacriticsToBuf(buf);
+ return buf;
+}
+
+void String::copyToBuf(String& buffer) const {
+ buffer._data = _data;
+ buffer._data.resize(_data.size());
auto index = 0;
for (auto codepoint : _data) {
- newdata[index++] = codepointToLower(codepoint, mode);
+ buffer._data[index++] = codepoint;
}
+ buffer._needsOutputConversion = true;
+}
- return String(std::move(newdata));
+void String::substrToBuf(size_t pos, size_t len, String& buffer) const {
+ buffer._data.resize(len + 1);
+ for (size_t index = 0, src_pos = pos; index < len;) {
+ buffer._data[index++] = _data[src_pos++];
+ }
+ buffer._data[len] = '\0';
+ buffer._needsOutputConversion = true;
}
-String String::removeDiacritics() const {
- u32string newdata(_data.size(), 0);
+void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const {
+ buffer._data.resize(_data.size());
+ auto index = 0;
+ for (auto codepoint : _data) {
+ buffer._data[index++] = codepointToLower(codepoint, mode);
+ }
+ buffer._needsOutputConversion = true;
+}
+
+void String::removeDiacriticsToBuf(String& buffer) const {
+ buffer._data.resize(_data.size());
auto index = 0;
for (auto codepoint : _data) {
if (!codepointIsDiacritic(codepoint)) {
- newdata[index++] = codepointRemoveDiacritics(codepoint);
+ buffer._data[index++] = codepointRemoveDiacritics(codepoint);
}
}
-
- newdata.resize(index);
- return String(std::move(newdata));
+ buffer._data.resize(index);
+ buffer._needsOutputConversion = true;
}
bool String::substrMatch(const String& str,
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
index ddfa6f93870..c3355ee4f25 100644
--- a/src/mongo/db/fts/unicode/string.h
+++ b/src/mongo/db/fts/unicode/string.h
@@ -67,7 +67,7 @@ public:
void resetData(const StringData utf8_src);
/**
- * Return a lowercased version of the String instance using the Unicode data in u_data.h.
+ * Return a lowercased version of the String instance.
*/
String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const;
@@ -82,9 +82,30 @@ public:
String substr(size_t begin, size_t end) const;
/**
- * Returns a UTF-8 encoded std::string version of the String instance.
+ * Copies the current String to another String.
*/
- std::string toString() const;
+ void copyToBuf(String& buffer) const;
+
+ /**
+ * Takes a substring of the current String and puts it in another String.
+ */
+ void substrToBuf(size_t pos, size_t len, String& buffer) const;
+
+ /**
+ * Lowercases the current String and stores the result in another String.
+ */
+ void toLowerToBuf(CaseFoldMode mode, String& buffer) const;
+
+ /**
+ * Removes diacritics from the current String and stores the result in another String.
+ */
+ void removeDiacriticsToBuf(String& buffer) const;
+
+ /**
+ * Returns a UTF-8 encoded std::string version of the String instance. Uses the conversion
+ * stored in the output buffer when possible.
+ */
+ std::string toString();
/**
* Returns the number Unicode codepoints in the String.
@@ -143,6 +164,17 @@ private:
* The underlying UTF-32 data.
*/
std::u32string _data;
+
+ /**
+ * A buffer for storing the result of the UTF-32 to UTF-8 conversion.
+ */
+ std::string _outputBuf;
+
+ /**
+ * True when toString() must redo the UTF-32 to UTF-8 conversion. Starts true so that a
+ * String built by a constructor that does not set this flag never reads it uninitialized.
+ */
+ bool _needsOutputConversion = true;
};
} // namespace unicode