summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Mathias Stearn <mathias@10gen.com> 2016-02-29 12:49:58 -0500
committer: Mathias Stearn <mathias@10gen.com> 2016-03-11 08:50:18 -0500
commit: 35f4f2f5a58e5dc90b583e8bc6089eaa2d83e065 (patch)
tree: 7be60b0a571c00be6bf65724bfd981d5e2a9f400
parent: 67eee08bb606537df7417670d423c0527dd6221f (diff)
download: mongo-35f4f2f5a58e5dc90b583e8bc6089eaa2d83e065.tar.gz
SERVER-19936 Rename unicode::string::prepForSubstrMatch and make easier to use
-rw-r--r-- src/mongo/db/fts/unicode/string.cpp 25
-rw-r--r-- src/mongo/db/fts/unicode/string.h 37
-rw-r--r-- src/mongo/db/fts/unicode/string_test.cpp 52
3 files changed, 65 insertions, 49 deletions
diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp
index 483cb7b22eb..3218f04dbf7 100644
--- a/src/mongo/db/fts/unicode/string.cpp
+++ b/src/mongo/db/fts/unicode/string.cpp
@@ -158,11 +158,13 @@ void String::removeDiacriticsToBuf(String& buffer) const {
buffer._needsOutputConversion = true;
}
-std::pair<std::unique_ptr<char[]>, char*> String::prepForSubstrMatch(StringData utf8,
- SubstrMatchOptions options,
- CaseFoldMode mode) {
- // This function should only be called when casefolding or stripping diacritics.
- dassert(!(options & kCaseSensitive) || !(options & kDiacriticSensitive));
+String::MaybeOwnedStringData String::caseFoldAndStripDiacritics(StringData utf8,
+ SubstrMatchOptions options,
+ CaseFoldMode mode) {
+ if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) {
+ // No transformation needed. Just return the input data unmodified.
+ return utf8;
+ }
// Allocate space for up to 2x growth which is the worst possible case for stripping diacritics
// and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2
@@ -286,19 +288,12 @@ bool String::substrMatch(const std::string& str,
options &= ~kCaseSensitive;
}
- if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) {
- // No transformation needed. Just do the search on the input strings.
- return boost::algorithm::boyer_moore_search(
- str.cbegin(), str.cend(), find.cbegin(), find.cend()) != str.cend();
- }
-
- auto haystack = prepForSubstrMatch(str, options, cfMode);
- auto needle = prepForSubstrMatch(find, options, cfMode);
+ auto haystack = caseFoldAndStripDiacritics(str, options, cfMode);
+ auto needle = caseFoldAndStripDiacritics(find, options, cfMode);
// Case sensitive and diacritic sensitive.
return boost::algorithm::boyer_moore_search(
- haystack.first.get(), haystack.second, needle.first.get(), needle.second) !=
- haystack.second;
+ haystack.begin(), haystack.end(), needle.begin(), needle.end()) != haystack.end();
}
} // namespace unicode
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
index 1ba6e46c27c..64a0d89918b 100644
--- a/src/mongo/db/fts/unicode/string.h
+++ b/src/mongo/db/fts/unicode/string.h
@@ -45,6 +45,34 @@ namespace unicode {
*/
class String {
public:
+ /**
+ * A StringData that may own its own buffer.
+ */
+ class MaybeOwnedStringData : public StringData {
+ public:
+ /**
+ * Makes an empty, unowned string.
+ */
+ MaybeOwnedStringData() = default;
+
+ /**
+ * Makes an owned string.
+ */
+ MaybeOwnedStringData(std::unique_ptr<char[]>&& buffer, const char* endIt)
+ : StringData(buffer.get(), endIt - buffer.get()), _buffer(std::move(buffer)) {}
+
+ /**
+ * Makes an unowned string.
+ */
+ /*implicit*/ MaybeOwnedStringData(StringData str) : StringData(str) {}
+ MaybeOwnedStringData& operator=(StringData str) {
+ return (*this = MaybeOwnedStringData(str));
+ }
+
+ private:
+ std::unique_ptr<char[]> _buffer;
+ };
+
String() = default;
#if defined(_MSC_VER) && _MSC_VER < 1900
@@ -155,13 +183,10 @@ public:
/**
* Strips diacritics and case-folds the utf8 input string, as needed to support options.
- *
- * Returns an owned buffer containing the output utf8 string and an end iterator for the string
- * (points at the first byte after the string).
*/
- static std::pair<std::unique_ptr<char[]>, char*> prepForSubstrMatch(StringData utf8,
- SubstrMatchOptions options,
- CaseFoldMode mode);
+ static MaybeOwnedStringData caseFoldAndStripDiacritics(StringData utf8,
+ SubstrMatchOptions options,
+ CaseFoldMode mode);
private:
/**
diff --git a/src/mongo/db/fts/unicode/string_test.cpp b/src/mongo/db/fts/unicode/string_test.cpp
index 8dec9a7e8df..d627120e9e7 100644
--- a/src/mongo/db/fts/unicode/string_test.cpp
+++ b/src/mongo/db/fts/unicode/string_test.cpp
@@ -52,14 +52,6 @@ namespace {
// implementation.
const std::string filler(32, 'x');
-/**
- * Converts return from prepForSubstrMatch to a StringData that is only useful in the current
- * expression.
- */
-StringData toStringData(std::pair<std::unique_ptr<char[]>, char*> result) {
- return {result.first.get(), size_t(result.second - result.first.get())};
-}
-
auto kDiacriticSensitive = String::kDiacriticSensitive;
auto kCaseSensitive = String::kCaseSensitive;
@@ -69,10 +61,10 @@ auto kNormal = CaseFoldMode::kNormal;
// Macro to preserve line numbers and arguments in error messages.
-#define TEST_PREP_FOR_SUBSTR_MATCH(expected, input, options, caseFoldMode) \
- ASSERT_EQ(expected, toStringData(String::prepForSubstrMatch(input, options, caseFoldMode))); \
- ASSERT_EQ(expected + filler, \
- toStringData(String::prepForSubstrMatch(input + filler, options, caseFoldMode)))
+#define TEST_CASE_FOLD_AND_STRIP_DIACRITICS(expected, input, options, caseFoldMode) \
+ ASSERT_EQ(expected, String::caseFoldAndStripDiacritics(input, options, caseFoldMode)); \
+ ASSERT_EQ(expected + filler, \
+ String::caseFoldAndStripDiacritics(input + filler, options, caseFoldMode))
TEST(UnicodeString, RemoveDiacritics) {
// Test all ascii chars.
@@ -82,7 +74,7 @@ TEST(UnicodeString, RemoveDiacritics) {
if (ch) { // String's constructor doesn't handle embedded NUL bytes.
ASSERT_EQUALS(output, String(input).removeDiacritics().toString());
}
- TEST_PREP_FOR_SUBSTR_MATCH(output, input, kCaseSensitive, kNormal);
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(output, input, kCaseSensitive, kNormal);
}
// NFC Normalized Text.
@@ -94,8 +86,9 @@ TEST(UnicodeString, RemoveDiacritics) {
ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), String(test1).removeDiacritics().toString());
ASSERT_EQUALS(UTF8("Cafe"), String(test2).removeDiacritics().toString());
- TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal);
- TEST_PREP_FOR_SUBSTR_MATCH(UTF8("Cafe"), test2, kCaseSensitive, kNormal);
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
+ UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal);
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("Cafe"), test2, kCaseSensitive, kNormal);
}
TEST(UnicodeString, CaseFolding) {
@@ -106,7 +99,7 @@ TEST(UnicodeString, CaseFolding) {
if (ch) { // String's constructor doesn't handle embedded NUL bytes.
ASSERT_EQUALS(lower, String(upper).toLower().toString());
}
- TEST_PREP_FOR_SUBSTR_MATCH(lower, upper, kDiacriticSensitive, kNormal);
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(lower, upper, kDiacriticSensitive, kNormal);
}
const char test1[] = UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?");
@@ -115,8 +108,9 @@ TEST(UnicodeString, CaseFolding) {
ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLower().toString());
ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLower().toString());
- TEST_PREP_FOR_SUBSTR_MATCH(UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal);
- TEST_PREP_FOR_SUBSTR_MATCH(
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
+ UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal);
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
UTF8("¿cuántos años tienes tú?"), test2, kDiacriticSensitive, kNormal);
}
@@ -129,8 +123,10 @@ TEST(UnicodeString, CaseFoldingTurkish) {
ASSERT_EQUALS(UTF8("kac yasindasiniz"),
String(test2).toLower(CaseFoldMode::kTurkish).toString());
- TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish);
- TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasindasiniz"), test2, kDiacriticSensitive, kTurkish);
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
+ UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish);
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(
+ UTF8("kac yasindasiniz"), test2, kDiacriticSensitive, kTurkish);
}
TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) {
@@ -147,9 +143,9 @@ TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) {
String(test2).toLower().removeDiacritics().toString());
ASSERT_EQUALS(UTF8("cafe"), String(test3).toLower().removeDiacritics().toString());
- TEST_PREP_FOR_SUBSTR_MATCH(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal);
- TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal);
- TEST_PREP_FOR_SUBSTR_MATCH(UTF8("cafe"), test3, 0, kNormal);
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal);
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal);
+ TEST_CASE_FOLD_AND_STRIP_DIACRITICS(UTF8("cafe"), test3, 0, kNormal);
}
TEST(UnicodeString, SubstringMatch) {
@@ -218,13 +214,13 @@ TEST(UnicodeString, BadUTF8) {
ASSERT_THROWS(String test3(invalid3), AssertionException);
ASSERT_THROWS(String test4(invalid4), AssertionException);
- // preForSubstrMatch doesn't make any guarantees about behavior when fed invalid utf8.
+ // caseFoldAndStripDiacritics doesn't make any guarantees about behavior when fed invalid utf8.
// These calls are to ensure that they don't trigger any faults in sanitizing builds.
- String::prepForSubstrMatch(invalid1, 0, kNormal);
- String::prepForSubstrMatch(invalid2, 0, kNormal);
- String::prepForSubstrMatch(invalid3, 0, kNormal);
+ String::caseFoldAndStripDiacritics(invalid1, 0, kNormal);
+ String::caseFoldAndStripDiacritics(invalid2, 0, kNormal);
+ String::caseFoldAndStripDiacritics(invalid3, 0, kNormal);
- ASSERT_THROWS(String::prepForSubstrMatch(invalid4, 0, kNormal), AssertionException);
+ ASSERT_THROWS(String::caseFoldAndStripDiacritics(invalid4, 0, kNormal), AssertionException);
}
TEST(UnicodeString, UTF32ToUTF8) {