diff options
author | Mathias Stearn <mathias@10gen.com> | 2016-02-24 16:35:54 -0500 |
---|---|---|
committer | Mathias Stearn <mathias@10gen.com> | 2016-03-11 08:50:18 -0500 |
commit | 6c3157f126bb44ab275325e85de7abee5ce9ad6d (patch) | |
tree | 16189ba18d0febfdd564006c1f03f008527f4f41 /src/mongo/db/fts/unicode/string.h | |
parent | 4a35c7184e188354793f16d27e2330b3b5ce7f8f (diff) | |
download | mongo-6c3157f126bb44ab275325e85de7abee5ce9ad6d.tar.gz |
SERVER-19936 Optimize FTS v3 phrase matching
Major changes:
* Use Boyer-Moore algorithm for searching rather than std::search
* All strings are kept in UTF-8 rather than going to UTF-32.
* Case folding and diacritic removal are done in a single pass.
* Optimize case folding and diacritic removal for ASCII codepoints.
* Combine functionality of codepointIsDiacritic() into
codepointRemoveDiacritics()
Diffstat (limited to 'src/mongo/db/fts/unicode/string.h')
-rw-r--r-- | src/mongo/db/fts/unicode/string.h | 25 |
1 files changed, 17 insertions, 8 deletions
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h index c3355ee4f25..1ba6e46c27c 100644 --- a/src/mongo/db/fts/unicode/string.h +++ b/src/mongo/db/fts/unicode/string.h @@ -29,6 +29,7 @@ #pragma once #include <cstdint> +#include <memory> #include <string> #include "mongo/base/string_data.h" @@ -110,12 +111,16 @@ public: /** * Returns the number Unicode codepoints in the String. */ - size_t size() const; + size_t size() const { + return _data.size(); + } /** * Returns the Unicode codepoint at index i of the String. */ - const char32_t& operator[](int i) const; + const char32_t& operator[](int i) const { + return _data[i]; + } /** * Options for the substrMatch method. @@ -143,18 +148,22 @@ public: * the search is case insensitive, non-Turkish case folding is used unless the * CaseFoldMode::Turkish is passed to mode. */ - static bool substrMatch(const String& str, - const String& find, + static bool substrMatch(const std::string& str, + const std::string& find, SubstrMatchOptions options, CaseFoldMode mode = CaseFoldMode::kNormal); -private: /** - * Private constructor used by substr, toLower, and removeDiacritics to build a String from - * UTF-32 data. + * Strips diacritics and case-folds the utf8 input string, as needed to support options. + * + * Returns an owned buffer containing the output utf8 string and an end iterator for the string + * (points at the first byte after the string). */ - String(std::u32string&& src); + static std::pair<std::unique_ptr<char[]>, char*> prepForSubstrMatch(StringData utf8, + SubstrMatchOptions options, + CaseFoldMode mode); +private: /** * Helper method for converting a UTF-8 string to a UTF-32 string. */ |