SERVER-19936 Rename unicode::string::prepForSubstrMatch and make it easier to use

author: Mathias Stearn <mathias@10gen.com> 2016-02-29 12:49:58 -0500
committer: Mathias Stearn <mathias@10gen.com> 2016-03-11 08:50:18 -0500
commit: 35f4f2f5a58e5dc90b583e8bc6089eaa2d83e065 (patch)
tree: 7be60b0a571c00be6bf65724bfd981d5e2a9f400
parent: 67eee08bb606537df7417670d423c0527dd6221f (diff)
download: mongo-35f4f2f5a58e5dc90b583e8bc6089eaa2d83e065.tar.gz
3 files changed, 65 insertions, 49 deletions
diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp
index 483cb7b22eb..3218f04dbf7 100644
--- a/src/mongo/db/fts/unicode/string.cpp
+++ b/src/mongo/db/fts/unicode/string.cpp
@@ -158,11 +158,13 @@ void String::removeDiacriticsToBuf(String& buffer) const {
     buffer._needsOutputConversion = true;
 }
 
-std::pair<std::unique_ptr<char[]>, char*> String::prepForSubstrMatch(StringData utf8,
-                                                                     SubstrMatchOptions options,
-                                                                     CaseFoldMode mode) {
-    // This function should only be called when casefolding or stripping diacritics.
-    dassert(!(options & kCaseSensitive) || !(options & kDiacriticSensitive));
+String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8,
+                                                                SubstrMatchOptions options,
+                                                                CaseFoldMode mode) {
+    if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) {
+        // No transformation needed. Just return the input data unmodified.
+        return utf8;
+    }
 
     // Allocate space for up to 2x growth which is the worst possible case for stripping diacritics
     // and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2
@@ -286,19 +288,12 @@ bool String::substrMatch(const std::string& str,
         options &= ~kCaseSensitive;
     }
 
-    if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) {
-        // No transformation needed. Just do the search on the input strings.
-        return boost::algorithm::boyer_moore_search(
-                   str.cbegin(), str.cend(), find.cbegin(), find.cend()) != str.cend();
-    }
-
-    auto haystack = prepForSubstrMatch(str, options, cfMode);
-    auto needle = prepForSubstrMatch(find, options, cfMode);
+    auto haystack = caseFoldAndStripDiacritics(str, options, cfMode);
+    auto needle = caseFoldAndStripDiacritics(find, options, cfMode);
 
     // Case sensitive and diacritic sensitive.
     return boost::algorithm::boyer_moore_search(
-               haystack.first.get(), haystack.second, needle.first.get(), needle.second) !=
-        haystack.second;
+               haystack.begin(), haystack.end(), needle.begin(), needle.end()) != haystack.end();
 }
 
 }  // namespace unicode
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
index 1ba6e46c27c..64a0d89918b 100644
--- a/src/mongo/db/fts/unicode/string.h
+++ b/src/mongo/db/fts/unicode/string.h
@@ -45,6 +45,34 @@ namespace unicode {
  */
 class String {
 public:
+    /**
+     * A StringData view that may also own the buffer it points into (freed with this object).
+     */
+    class MaybeOwnedStringData : public StringData {
+    public:
+        /**
+         * Makes an empty, unowned string.
+         */
+        MaybeOwnedStringData() = default;
+
+        /**
+         * Makes an owned string: takes ownership of 'buffer' and views [buffer.get(), endIt).
+         */
+        MaybeOwnedStringData(std::unique_ptr<char[]>&& buffer, const char* endIt)
+            : StringData(buffer.get(), endIt - buffer.get()), _buffer(std::move(buffer)) {}
+
+        /**
+         * Makes an unowned string viewing str's data, which must outlive this object.
+         */
+        /*implicit*/ MaybeOwnedStringData(StringData str) : StringData(str) {}
+        MaybeOwnedStringData& operator=(StringData str) {
+            return (*this = MaybeOwnedStringData(str));  // move-assign; frees any owned buffer
+        }
+
+    private:
+        std::unique_ptr<char[]> _buffer;  // null when the string is unowned
+    };
+
     String() = default;
 
 #if defined(_MSC_VER) && _MSC_VER < 1900
@@ -155,13 +183,10 @@ public:
 
     /**
      * Strips diacritics and case-folds the utf8 input string, as needed to support options.
-     *
-     * Returns an owned buffer containing the output utf8 string and an end iterator for the string
-     * (points at the first byte after the string).
      */
-    static std::pair<std::unique_ptr<char[]>, char*> prepForSubstrMatch(StringData utf8,
-                                                                        SubstrMatchOptions options,
-                                                                        CaseFoldMode mode);
+    static MaybeOwnedStringData caseFoldAndStripDiacritics(StringData utf8,
+                                                           SubstrMatchOptions options,
+                                                           CaseFoldMode mode);
 
 private:
     /**
diff --git a/src/mongo/db/fts/unicode/string_test.cpp b/src/mongo/db/fts/unicode/string_test.cpp
index 8dec9a7e8df..d627120e9e7 100644
--- a/src/mongo/db/fts/unicode/string_test.cpp
+++ b/src/mongo/db/fts/unicode/string_test.cpp
@@ -52,14 +52,6 @@ namespace {
 // implementation.
 const std::string filler(32, 'x');
 
-/**
- * Converts return from prepForSubstrMatch to a StringData that is only useful in the current
- * expression.
- */
-StringData toStringData(std::pair<std::unique_ptr<char[]>, char*> result) {
-    return {result.first.get(), size_t(result.second - result.first.get())};
-}
-
 auto kDiacriticSensitive = String::kDiacriticSensitive;
 auto kCaseSensitive = String::kCaseSensitive;
 
@@ -69,10 +61,10 @@ auto kNormal = CaseFoldMode::kNormal;
 
 
 // Macro to preserve line numbers and arguments in error messages.
-#define TEST_PREP_FOR_SUBSTR_MATCH(expected, input, options, caseFoldMode)                       \
-    ASSERT_EQ(expected, toStringData(String::prepForSubstrMatch(input, options, caseFoldMode))); \
-    ASSERT_EQ(expected + filler,                                                                 \
-              toStringData(String::prepForSubstrMatch(input + filler, options, caseFoldMode)))
+#define TEST_CASE_FOLD_AND_STRIP_DIACRITICS(expected, input, options, caseFoldMode)        \
+    ASSERT_EQ(expected, String::caseFoldAndStripDiacritics(input, options, caseFoldMode)); \
+    ASSERT_EQ(expected + filler,                                                           \
+              String::caseFoldAndStripDiacritics(input + filler, options, caseFoldMode))
 
 TEST(UnicodeString, RemoveDiacritics) {
     // Test all ascii chars.
@@ -82,7 +74,7 @@ TEST(UnicodeString, RemoveDiacritics) {
         if (ch) {  // String's constructor doesn't handle embedded NUL bytes.
             ASSERT_EQUALS(output, String(input).removeDiacritics().toString());
         }
-        TEST_PREP_FOR_SUBSTR_MATCH(output, input, kCaseSensitive, kNormal);
+        TEST_CASE_FOLD_AND_STRIP_DIACRITICS(output, input, kCaseSensitive, kNormal);
     }
 
     // NFC Normalized Text.
@@ -94,8 +86,9 @@ TEST(UnicodeString, RemoveDiacritics) {
     ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), String(test1).removeDiacritics().toString());
     ASSERT_EQUALS(UTF8("Cafe"), String(test2).removeDiacritics().toString());
 
-    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal);
-    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("Cafe"), test2, kCaseSensitive, kNormal);
+    TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
+        UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal);
+    TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("Cafe"), test2, kCaseSensitive, kNormal);
 }
 
 TEST(UnicodeString, CaseFolding) {
@@ -106,7 +99,7 @@ TEST(UnicodeString, CaseFolding) {
         if (ch) {  // String's constructor doesn't handle embedded NUL bytes.
             ASSERT_EQUALS(lower, String(upper).toLower().toString());
         }
-        TEST_PREP_FOR_SUBSTR_MATCH(lower, upper, kDiacriticSensitive, kNormal);
+        TEST_CASE_FOLD_AND_STRIP_DIACRITICS(lower, upper, kDiacriticSensitive, kNormal);
     }
 
     const char test1[] = UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?");
@@ -115,8 +108,9 @@ TEST(UnicodeString, CaseFolding) {
     ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLower().toString());
     ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLower().toString());
 
-    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal);
-    TEST_PREP_FOR_SUBSTR_MATCH(
+    TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
+        UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal);
+    TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
         UTF8("¿cuántos años tienes tú?"), test2, kDiacriticSensitive, kNormal);
 }
 
@@ -129,8 +123,10 @@ TEST(UnicodeString, CaseFoldingTurkish) {
     ASSERT_EQUALS(UTF8("kac yasindasiniz"),
                   String(test2).toLower(CaseFoldMode::kTurkish).toString());
 
-    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish);
-    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasindasiniz"), test2, kDiacriticSensitive, kTurkish);
+    TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
+        UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish);
+    TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
+        UTF8("kac yasindasiniz"), test2, kDiacriticSensitive, kTurkish);
 }
 
 TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) {
@@ -147,9 +143,9 @@ TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) {
                   String(test2).toLower().removeDiacritics().toString());
     ASSERT_EQUALS(UTF8("cafe"), String(test3).toLower().removeDiacritics().toString());
 
-    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal);
-    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal);
-    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("cafe"), test3, 0, kNormal);
+    TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal);
+    TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal);
+    TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("cafe"), test3, 0, kNormal);
 }
 
 TEST(UnicodeString, SubstringMatch) {
@@ -218,13 +214,13 @@ TEST(UnicodeString, BadUTF8) {
     ASSERT_THROWS(String test3(invalid3), AssertionException);
     ASSERT_THROWS(String test4(invalid4), AssertionException);
 
-    // preForSubstrMatch doesn't make any guarantees about behavior when fed invalid utf8.
+    // caseFoldAndStripDiacritics doesn't make any guarantees about behavior when fed invalid utf8.
     // These calls are to ensure that they don't trigger any faults in sanitizing builds.
-    String::prepForSubstrMatch(invalid1, 0, kNormal);
-    String::prepForSubstrMatch(invalid2, 0, kNormal);
-    String::prepForSubstrMatch(invalid3, 0, kNormal);
+    String::caseFoldAndStripDiacritics(invalid1, 0, kNormal);
+    String::caseFoldAndStripDiacritics(invalid2, 0, kNormal);
+    String::caseFoldAndStripDiacritics(invalid3, 0, kNormal);
 
-    ASSERT_THROWS(String::prepForSubstrMatch(invalid4, 0, kNormal), AssertionException);
+    ASSERT_THROWS(String::caseFoldAndStripDiacritics(invalid4, 0, kNormal), AssertionException);
 }
 
 TEST(UnicodeString, UTF32ToUTF8) {
author	Mathias Stearn <mathias@10gen.com>	2016-02-29 12:49:58 -0500
committer	Mathias Stearn <mathias@10gen.com>	2016-03-11 08:50:18 -0500
commit	35f4f2f5a58e5dc90b583e8bc6089eaa2d83e065 (patch)
tree	7be60b0a571c00be6bf65724bfd981d5e2a9f400
parent	67eee08bb606537df7417670d423c0527dd6221f (diff)
download	mongo-35f4f2f5a58e5dc90b583e8bc6089eaa2d83e065.tar.gz