summaryrefslogtreecommitdiff
path: root/src/mongo/db/fts/unicode/string.h
diff options
context:
space:
mode:
authorMathias Stearn <mathias@10gen.com>2016-02-24 16:35:54 -0500
committerMathias Stearn <mathias@10gen.com>2016-03-11 08:50:18 -0500
commit6c3157f126bb44ab275325e85de7abee5ce9ad6d (patch)
tree16189ba18d0febfdd564006c1f03f008527f4f41 /src/mongo/db/fts/unicode/string.h
parent4a35c7184e188354793f16d27e2330b3b5ce7f8f (diff)
downloadmongo-6c3157f126bb44ab275325e85de7abee5ce9ad6d.tar.gz
SERVER-19936 Optimize FTS v3 phrase matching
Major changes: * Use Boyer-Moore algorithm for searching rather than std::search * All strings are kept in UTF8 rather than going to UTF32. * Case folding and diacritic removal are done in a single pass. * Optimize case folding and diacritic removal for ASCII codepoints. * Combine functionality of codepointIsDiacritic() into codepointRemoveDiacritics()
Diffstat (limited to 'src/mongo/db/fts/unicode/string.h')
-rw-r--r--src/mongo/db/fts/unicode/string.h25
1 files changed, 17 insertions, 8 deletions
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
index c3355ee4f25..1ba6e46c27c 100644
--- a/src/mongo/db/fts/unicode/string.h
+++ b/src/mongo/db/fts/unicode/string.h
@@ -29,6 +29,7 @@
#pragma once
#include <cstdint>
+#include <memory>
#include <string>
#include "mongo/base/string_data.h"
@@ -110,12 +111,16 @@ public:
/**
* Returns the number of Unicode codepoints in the String.
*/
- size_t size() const;
+ size_t size() const {
+ return _data.size();
+ }
/**
* Returns the Unicode codepoint at index i of the String.
*/
- const char32_t& operator[](int i) const;
+ const char32_t& operator[](int i) const {
+ return _data[i];
+ }
/**
* Options for the substrMatch method.
@@ -143,18 +148,22 @@ public:
* the search is case insensitive, non-Turkish case folding is used unless the
* CaseFoldMode::Turkish is passed to mode.
*/
- static bool substrMatch(const String& str,
- const String& find,
+ static bool substrMatch(const std::string& str,
+ const std::string& find,
SubstrMatchOptions options,
CaseFoldMode mode = CaseFoldMode::kNormal);
-private:
/**
- * Private constructor used by substr, toLower, and removeDiacritics to build a String from
- * UTF-32 data.
+ * Strips diacritics and case-folds the utf8 input string, as needed to support options.
+ *
+ * Returns an owned buffer containing the output utf8 string and an end iterator for the string
+ * (points at the first byte after the string).
*/
- String(std::u32string&& src);
+ static std::pair<std::unique_ptr<char[]>, char*> prepForSubstrMatch(StringData utf8,
+ SubstrMatchOptions options,
+ CaseFoldMode mode);
+private:
/**
* Helper method for converting a UTF-8 string to a UTF-32 string.
*/