diff options
author | Mathias Stearn <mathias@10gen.com> | 2016-02-29 12:49:58 -0500 |
---|---|---|
committer | Mathias Stearn <mathias@10gen.com> | 2016-03-11 08:50:18 -0500 |
commit | 35f4f2f5a58e5dc90b583e8bc6089eaa2d83e065 (patch) | |
tree | 7be60b0a571c00be6bf65724bfd981d5e2a9f400 | |
parent | 67eee08bb606537df7417670d423c0527dd6221f (diff) | |
download | mongo-35f4f2f5a58e5dc90b583e8bc6089eaa2d83e065.tar.gz |
SERVER-19936 Rename unicode::String::prepForSubstrMatch to caseFoldAndStripDiacritics and make it easier to use
-rw-r--r-- | src/mongo/db/fts/unicode/string.cpp | 25 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string.h | 37 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string_test.cpp | 52 |
3 files changed, 65 insertions, 49 deletions
diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp index 483cb7b22eb..3218f04dbf7 100644 --- a/src/mongo/db/fts/unicode/string.cpp +++ b/src/mongo/db/fts/unicode/string.cpp @@ -158,11 +158,13 @@ void String::removeDiacriticsToBuf(String& buffer) const { buffer._needsOutputConversion = true; } -std::pair<std::unique_ptr<char[]>, char*> String::prepForSubstrMatch(StringData utf8, - SubstrMatchOptions options, - CaseFoldMode mode) { - // This function should only be called when casefolding or stripping diacritics. - dassert(!(options & kCaseSensitive) || !(options & kDiacriticSensitive)); +String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8, + SubstrMatchOptions options, + CaseFoldMode mode) { + if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) { + // No transformation needed. Just return the input data unmodified. + return utf8; + } // Allocate space for up to 2x growth which is the worst possible case for stripping diacritics // and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2 @@ -286,19 +288,12 @@ bool String::substrMatch(const std::string& str, options &= ~kCaseSensitive; } - if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) { - // No transformation needed. Just do the search on the input strings. - return boost::algorithm::boyer_moore_search( - str.cbegin(), str.cend(), find.cbegin(), find.cend()) != str.cend(); - } - - auto haystack = prepForSubstrMatch(str, options, cfMode); - auto needle = prepForSubstrMatch(find, options, cfMode); + auto haystack = caseFoldAndStripDiacritics(str, options, cfMode); + auto needle = caseFoldAndStripDiacritics(find, options, cfMode); // Case sensitive and diacritic sensitive. 
return boost::algorithm::boyer_moore_search( - haystack.first.get(), haystack.second, needle.first.get(), needle.second) != - haystack.second; + haystack.begin(), haystack.end(), needle.begin(), needle.end()) != haystack.end(); } } // namespace unicode diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h index 1ba6e46c27c..64a0d89918b 100644 --- a/src/mongo/db/fts/unicode/string.h +++ b/src/mongo/db/fts/unicode/string.h @@ -45,6 +45,34 @@ namespace unicode { */ class String { public: + /** + * A StringData that may own its own buffer. + */ + class MaybeOwnedStringData : public StringData { + public: + /** + * Makes an empty, unowned string. + */ + MaybeOwnedStringData() = default; + + /** + * Makes an owned string. + */ + MaybeOwnedStringData(std::unique_ptr<char[]>&& buffer, const char* endIt) + : StringData(buffer.get(), endIt - buffer.get()), _buffer(std::move(buffer)) {} + + /** + * Makes an unowned string. + */ + /*implicit*/ MaybeOwnedStringData(StringData str) : StringData(str) {} + MaybeOwnedStringData& operator=(StringData str) { + return (*this = MaybeOwnedStringData(str)); + } + + private: + std::unique_ptr<char[]> _buffer; + }; + String() = default; #if defined(_MSC_VER) && _MSC_VER < 1900 @@ -155,13 +183,10 @@ public: /** * Strips diacritics and case-folds the utf8 input string, as needed to support options. - * - * Returns an owned buffer containing the output utf8 string and an end iterator for the string - * (points at the first byte after the string). 
*/ - static std::pair<std::unique_ptr<char[]>, char*> prepForSubstrMatch(StringData utf8, - SubstrMatchOptions options, - CaseFoldMode mode); + static MaybeOwnedStringData caseFoldAndStripDiacritics(StringData utf8, + SubstrMatchOptions options, + CaseFoldMode mode); private: /** diff --git a/src/mongo/db/fts/unicode/string_test.cpp b/src/mongo/db/fts/unicode/string_test.cpp index 8dec9a7e8df..d627120e9e7 100644 --- a/src/mongo/db/fts/unicode/string_test.cpp +++ b/src/mongo/db/fts/unicode/string_test.cpp @@ -52,14 +52,6 @@ namespace { // implementation. const std::string filler(32, 'x'); -/** - * Converts return from prepForSubstrMatch to a StringData that is only useful in the current - * expression. - */ -StringData toStringData(std::pair<std::unique_ptr<char[]>, char*> result) { - return {result.first.get(), size_t(result.second - result.first.get())}; -} - auto kDiacriticSensitive = String::kDiacriticSensitive; auto kCaseSensitive = String::kCaseSensitive; @@ -69,10 +61,10 @@ auto kNormal = CaseFoldMode::kNormal; // Macro to preserve line numbers and arguments in error messages. -#define TEST_PREP_FOR_SUBSTR_MATCH(expected, input, options, caseFoldMode) \ - ASSERT_EQ(expected, toStringData(String::prepForSubstrMatch(input, options, caseFoldMode))); \ - ASSERT_EQ(expected + filler, \ - toStringData(String::prepForSubstrMatch(input + filler, options, caseFoldMode))) +#define TEST_CASE_FOLD_AND_STRIP_DIACRITICS(expected, input, options, caseFoldMode) \ + ASSERT_EQ(expected, String::caseFoldAndStripDiacritics(input, options, caseFoldMode)); \ + ASSERT_EQ(expected + filler, \ + String::caseFoldAndStripDiacritics(input + filler, options, caseFoldMode)) TEST(UnicodeString, RemoveDiacritics) { // Test all ascii chars. @@ -82,7 +74,7 @@ TEST(UnicodeString, RemoveDiacritics) { if (ch) { // String's constructor doesn't handle embedded NUL bytes. 
ASSERT_EQUALS(output, String(input).removeDiacritics().toString()); } - TEST_PREP_FOR_SUBSTR_MATCH(output, input, kCaseSensitive, kNormal); + TEST_CASE_FOLD_AND_STRIP_DIACRITICS(output, input, kCaseSensitive, kNormal); } // NFC Normalized Text. @@ -94,8 +86,9 @@ TEST(UnicodeString, RemoveDiacritics) { ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), String(test1).removeDiacritics().toString()); ASSERT_EQUALS(UTF8("Cafe"), String(test2).removeDiacritics().toString()); - TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal); - TEST_PREP_FOR_SUBSTR_MATCH(UTF8("Cafe"), test2, kCaseSensitive, kNormal); + TEST_CASE_FOLD_AND_STRIP_DIACRITICS( + UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal); + TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("Cafe"), test2, kCaseSensitive, kNormal); } TEST(UnicodeString, CaseFolding) { @@ -106,7 +99,7 @@ TEST(UnicodeString, CaseFolding) { if (ch) { // String's constructor doesn't handle embedded NUL bytes. ASSERT_EQUALS(lower, String(upper).toLower().toString()); } - TEST_PREP_FOR_SUBSTR_MATCH(lower, upper, kDiacriticSensitive, kNormal); + TEST_CASE_FOLD_AND_STRIP_DIACRITICS(lower, upper, kDiacriticSensitive, kNormal); } const char test1[] = UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?"); @@ -115,8 +108,9 @@ TEST(UnicodeString, CaseFolding) { ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLower().toString()); ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLower().toString()); - TEST_PREP_FOR_SUBSTR_MATCH(UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal); - TEST_PREP_FOR_SUBSTR_MATCH( + TEST_CASE_FOLD_AND_STRIP_DIACRITICS( + UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal); + TEST_CASE_FOLD_AND_STRIP_DIACRITICS( UTF8("¿cuántos años tienes tú?"), test2, kDiacriticSensitive, kNormal); } @@ -129,8 +123,10 @@ TEST(UnicodeString, CaseFoldingTurkish) { ASSERT_EQUALS(UTF8("kac yasindasiniz"), String(test2).toLower(CaseFoldMode::kTurkish).toString()); - 
TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish); - TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasindasiniz"), test2, kDiacriticSensitive, kTurkish); + TEST_CASE_FOLD_AND_STRIP_DIACRITICS( + UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish); + TEST_CASE_FOLD_AND_STRIP_DIACRITICS( + UTF8("kac yasindasiniz"), test2, kDiacriticSensitive, kTurkish); } TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) { @@ -147,9 +143,9 @@ TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) { String(test2).toLower().removeDiacritics().toString()); ASSERT_EQUALS(UTF8("cafe"), String(test3).toLower().removeDiacritics().toString()); - TEST_PREP_FOR_SUBSTR_MATCH(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal); - TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal); - TEST_PREP_FOR_SUBSTR_MATCH(UTF8("cafe"), test3, 0, kNormal); + TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal); + TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal); + TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("cafe"), test3, 0, kNormal); } TEST(UnicodeString, SubstringMatch) { @@ -218,13 +214,13 @@ TEST(UnicodeString, BadUTF8) { ASSERT_THROWS(String test3(invalid3), AssertionException); ASSERT_THROWS(String test4(invalid4), AssertionException); - // preForSubstrMatch doesn't make any guarantees about behavior when fed invalid utf8. + // caseFoldAndStripDiacritics doesn't make any guarantees about behavior when fed invalid utf8. // These calls are to ensure that they don't trigger any faults in sanitizing builds. 
- String::prepForSubstrMatch(invalid1, 0, kNormal); - String::prepForSubstrMatch(invalid2, 0, kNormal); - String::prepForSubstrMatch(invalid3, 0, kNormal); + String::caseFoldAndStripDiacritics(invalid1, 0, kNormal); + String::caseFoldAndStripDiacritics(invalid2, 0, kNormal); + String::caseFoldAndStripDiacritics(invalid3, 0, kNormal); - ASSERT_THROWS(String::prepForSubstrMatch(invalid4, 0, kNormal), AssertionException); + ASSERT_THROWS(String::caseFoldAndStripDiacritics(invalid4, 0, kNormal), AssertionException); } TEST(UnicodeString, UTF32ToUTF8) { |